RubyGems - murmuring_spider - Versions diffs - 0.0.2 - Mend

murmuring_spider 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

data/.gitignore +17 -0
data/.rspec +2 -0
data/.travis.yml +3 -0
data/Gemfile +4 -0
data/Guardfile +9 -0
data/LICENSE +22 -0
data/README.md +40 -0
data/Rakefile +14 -0
data/lib/murmuring_spider.rb +15 -0
data/lib/murmuring_spider/operation.rb +82 -0
data/lib/murmuring_spider/status.rb +68 -0
data/lib/murmuring_spider/version.rb +3 -0
data/murmuring_spider.gemspec +37 -0
data/spec/murmuring_spider/operation_spec.rb +118 -0
data/spec/murmuring_spider/status_spec.rb +56 -0
data/spec/spec_helper.rb +30 -0
data/spec/twitter_search_status.dump +1 -0
data/spec/twitter_status.dump +0 -0
metadata +200 -0

data/.gitignore ADDED

@@ -0,0 +1,17 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp

data/.rspec ADDED

	@@ -0,0 +1,2 @@
1	+ --color
2	+ --format progress

data/.travis.yml ADDED

@@ -0,0 +1,3 @@
+rvm:
+  - 1.8.7 # (current default)
+  - 1.9.2

data/Gemfile ADDED

@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+# Specify your gem's dependencies in murmuring_spider.gemspec
+gemspec

data/Guardfile ADDED

@@ -0,0 +1,9 @@
+# A sample Guardfile
+# More info at https://github.com/guard/guard#readme
+guard 'rspec', :version => 2 do
+  watch(%r{^spec/.+_spec\.rb$})
+  watch(%r{^lib/(.+)\.rb$})     { |m| "spec/#{m[1]}_spec.rb" }
+  watch('spec/spec_helper.rb')  { "spec" }
+end

data/LICENSE ADDED

@@ -0,0 +1,22 @@
+Copyright (c) 2012 tomykaira
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED

@@ -0,0 +1,40 @@
+# MurmuringSpider
+[![Build Status](https://secure.travis-ci.org/tomykaira/murmuring_spider.png)](http://travis-ci.org/tomykaira/murmuring_spider)
+MurmuringSpider is a concise Twitter crawler.
+When we write a data-mining / text-mining application based on Twitter timelines, we have to collect and store tweets first.
+I got tired of writing such a crawler again and again, so I wrote this gem.
+All you have to do is add queries and run them periodically.
+Thanks to the consistent Twitter API and the [twitter gem](http://twitter.rubyforge.org/), it is quite easy to track various types of timelines (such as user_timeline, home_timeline, search...).
+## Installation
+Add this line to your application's Gemfile:
+    gem 'murmuring_spider'
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install murmuring_spider
+## Usage
+[Usage of murmuring spider — Gist](https://gist.github.com/2060445)
+## Contributing
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Added some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request

data/Rakefile ADDED

@@ -0,0 +1,14 @@
+#!/usr/bin/env rake
+require "bundler/gem_tasks"
+begin
+  require 'rspec/core'
+  require 'rspec/core/rake_task'
+  RSpec::Core::RakeTask.new(:spec) do |spec|
+    spec.pattern = FileList['spec/**/*_spec.rb']
+  end
+  task :default => :spec
+rescue LoadError
+  puts 'RSpec is not installed'
+end

data/lib/murmuring_spider.rb ADDED

@@ -0,0 +1,15 @@
+require 'dm-core'
+require 'dm-migrations'
+require "murmuring_spider/version"
+require "murmuring_spider/operation"
+require "murmuring_spider/status"
+module MurmuringSpider
+  extend MurmuringSpider
+  def database_init(db)
+    DataMapper.setup(:default, db)
+    DataMapper.auto_upgrade!
+  end
+end

data/lib/murmuring_spider/operation.rb ADDED

@@ -0,0 +1,82 @@
+require 'dm-core'
+require 'dm-validations'
+require 'twitter'
+#
+# Operation: represents request to Twitter
+#
+module MurmuringSpider
+  class Operation
+    include DataMapper::Resource
+    property :id, Serial
+    property :type, String
+    property :target, String
+    property :opts, Object
+    validates_uniqueness_of :target, :scope => :type
+    has n, :statuses
+    self.raise_on_save_failure = true
+    class << self
+      #
+      # Add an operation
+      # * _type_ : request type.  Name of a Twitter's method
+      # * _target_ : First argument of the Twitter's method.  Usually, an user or a operation
+      # * _opts_ : options. Second argument of the Twitter's method.
+      #
+      # returns : created Operation instance
+      #
+      # raises : DataMapper::SaveFailureError
+      #
+      def add(type, target, opts = {})
+        create(:type => type, :target => target, :opts => opts)
+      end
+      #
+      # Run all queries
+      #
+      def run_all(client = Twitter)
+        all.map { |o| o.run(client) }
+      end
+      #
+      # Remove an operation specified by type and target
+      #
+      def remove(type, target)
+        first(:type => type, :target => target).destroy
+      end
+    end
+    #
+    # Execute Twitter request and update :since_id of _opts_
+    # This method has side effect
+    #
+    # returns : Array of Twitter::Status
+    #
+    def collect_statuses(client = Twitter)
+      res = client.__send__(type, target, opts)
+      unless res.empty?
+        self.opts = opts.merge(:since_id => res.first.id)
+        save
+      end
+      res
+    end
+    #
+    # Collect tweet statuses and save them
+    # Return value should not be used
+    #
+    def run(client = Twitter)
+      collect_statuses(client).each do |s|
+        # not to raise error on save, remove an invalid status beforehand
+        last = self.statuses.new(s)
+        self.statuses.pop unless last.valid?
+      end
+      save
+    end
+  end
+end

data/lib/murmuring_spider/status.rb ADDED

@@ -0,0 +1,68 @@
+require 'dm-core'
+require 'twitter/status'
+class Twitter::Status
+  def url_expanded_text
+    if @attrs['entities'].nil?
+      text
+    else
+      @url_expanded_text ||= Array(@attrs['entities']['urls']).reduce(text) do |t, url|
+        t.gsub(url['url'], url['expanded_url'])
+      end
+    end
+  end
+end
+module MurmuringSpider
+  class Status
+    include DataMapper::Resource
+    property :id, Serial
+    property :tweet_id, String, :unique => :operation_id
+    property :text, String, :length => 255
+    property :user_id, String
+    property :screen_name, String
+    property :created_at, DateTime
+    property :extended, Object
+    belongs_to :operation
+    @@extended_fields = {}
+    class << self
+      #
+      # extend fields
+      # You can save a parameter of status which is not supported by default
+      # If block given, initializer gives the _Twitter::Status_ object to it,
+      # and the result of the given block is used as the field value
+      #
+      # * _field_ : field name. String or Symbol is expected.
+      #   _Twitter::Status_ should have the same name method.
+      # * _&b_ : block to get the field value from _Twitter::Status_ object.
+      #
+      def extend(field, &b)
+        @@extended_fields[field] = b
+        define_method(field.to_s) do
+          extended[field]
+        end
+      end
+    end
+    def initialize(s)
+      values = {}
+      @@extended_fields.each do |field, func|
+        if func
+          values[field] = func.call(s)
+        else
+          values[field] = s.__send__(field)
+        end
+      end
+      super(:tweet_id => s.id,
+          :text => s.url_expanded_text,
+          :user_id => s.user ? s.user.id : s.from_user_id,
+          :screen_name => s.user ? s.user.screen_name : s.from_user,
+          :created_at => s.created_at,
+          :extended => values)
+    end
+  end
+end

data/lib/murmuring_spider/version.rb ADDED

@@ -0,0 +1,3 @@
+module MurmuringSpider
+  VERSION = "0.0.2"
+end

data/murmuring_spider.gemspec ADDED

@@ -0,0 +1,37 @@
+# -*- encoding: utf-8 -*-
+require File.expand_path('../lib/murmuring_spider/version', __FILE__)
+Gem::Specification.new do |gem|
+  gem.authors       = ["tomykaira"]
+  gem.email         = ["tomykaira@gmail.com"]
+  gem.description   = %q{MurmuringSpider is a concise Twitter crawler.
+When we write a data-mining / text-mining application based on twitter timeline, we have to collect and store tweets first.
+I am irritated with writing such crawler repeatedly, so I wrote this.
+What you have to do is only to add query and to run them periodically.
+Thanks to consistent Twitter API and twitter gem (http://twitter.rubyforge.org/), it is quite easy to track various types of timelines (such as user_timeline, home_timeline, search...)}
+  gem.summary       = %q{MurmuringSpider is a concise Twitter crawler with DataMapper.}
+  gem.homepage      = "https://github.com/tomykaira/murmuring_spider"
+  gem.add_dependency('dm-core')
+  gem.add_dependency('dm-migrations')
+  gem.add_dependency('dm-validations')
+  gem.add_dependency('twitter')
+  gem.add_development_dependency('rspec')
+  gem.add_development_dependency('guard')
+  gem.add_development_dependency('guard-rspec')
+  gem.add_development_dependency('database_cleaner')
+  gem.add_development_dependency('dm-sqlite-adapter')
+  gem.add_development_dependency('rake')
+  gem.files         = `git ls-files`.split($\)
+  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
+  gem.name          = "murmuring_spider"
+  gem.require_paths = ["lib"]
+  gem.version       = MurmuringSpider::VERSION
+end

data/spec/murmuring_spider/operation_spec.rb ADDED

@@ -0,0 +1,118 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+describe MurmuringSpider::Operation do
+  # a stored user_timeline operation used by the instance-level examples
+  let(:operation) { MurmuringSpider::Operation.add(:user_timeline, 'fake-user') }
+  subject { MurmuringSpider::Operation }
+  describe 'add' do
+    context 'when an user_timeline operation is added' do
+      before { subject.add(:user_timeline, 'tomy_kaira') }
+      it { should have(1).item }
+    end
+    context 'when the same operation is added' do
+      before { subject.add(:user_timeline, 'tomy_kaira') }
+      it "should raise error" do
+        expect { subject.add(:user_timeline, 'tomy_kaira') }.to raise_error(DataMapper::SaveFailureError)
+      end
+    end
+    context 'when an operation with different types and the same target is added' do
+      before { subject.add(:user_timeline, 'tomy_kaira') }
+      it "should create new operation" do
+        subject.add(:search, 'tomy_kaira')
+        subject.count.should == 2
+      end
+    end
+  end
+  describe 'run_all' do
+    before do
+      subject.add(:user_timeline, 'fake-user')
+      subject.add(:favorites, 'fake-user2')
+      Twitter.should_receive(:user_timeline).with('fake-user', anything).and_return([])
+      Twitter.should_receive(:favorites).with('fake-user2', anything).and_return([])
+    end
+    it "should run all tasks" do
+      subject.run_all
+    end
+  end
+  describe 'remove' do
+    before do
+      # force creation of the lazily defined operation
+      operation.should_not be_nil
+    end
+    it "should remove the operation" do
+      subject.remove(:user_timeline, 'fake-user')
+      subject.get(operation.id).should be_nil
+    end
+  end
+  describe 'collect_statuses' do
+    let(:response) { [status_mock(:id => 10), status_mock(:id => 7)] }
+    before { twitter_expectation }
+    context 'when the request succeeds' do
+      subject { operation.collect_statuses }
+      it { should == response }
+    end
+    context 'when requested twice' do
+      before do
+        # the second request must carry the id of the first status of the first response
+        twitter_expectation({:since_id => 10}, [])
+        operation.collect_statuses.should == response
+      end
+      subject { MurmuringSpider::Operation.get(operation.id).collect_statuses }
+      it { should be_empty }
+    end
+  end
+  context 'when an instance of Twitter::Client is given' do
+    let(:client) { mock(Twitter::Client) }
+    it "should use the instance, not Twitter module" do
+      client.should_receive(:user_timeline).with('fake-user', anything).and_return([])
+      operation.collect_statuses(client).should == []
+    end
+  end
+  describe 'run' do
+    let(:user) { mock(Twitter::User, :id => 12345, :screen_name => 'fake-user', :name => 'fake user') }
+    let(:status) { double(:id => 10,
+                   :user => user,
+                   :text => 'test tweet',
+                   :created_at => "Fri Mar 16 09:04:34 +0000 2012").as_null_object }
+    before { twitter_expectation({}, [status]) }
+    it 'should create Status instance' do
+      operation.run
+      MurmuringSpider::Status.should have(1).item
+      # named differently from the let(:status) helper to avoid shadowing it
+      saved = MurmuringSpider::Status.first(:tweet_id => 10)
+      saved.should_not be_nil
+      saved.operation.id.should == operation.id
+    end
+    context 'when the same tweet is returned by API twice' do
+      before do
+        operation.run
+        Twitter.should_receive(:user_timeline).and_return([status])
+        operation.run
+      end
+      it 'should create only one instance' do
+        MurmuringSpider::Status.should have(1).item
+      end
+    end
+  end
+  # Build a mock of Twitter::Status with the given stubbed methods
+  def status_mock(opts = {})
+    mock(Twitter::Status, opts)
+  end
+  # Expect one user_timeline request for 'fake-user' with _opts_, answered with _resp_
+  def twitter_expectation(opts = {}, resp = response)
+    Twitter.should_receive(:user_timeline).with('fake-user', opts).and_return(resp)
+  end
+end

data/spec/murmuring_spider/status_spec.rb ADDED

@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+shared_examples_for 'my tweet object' do
+  its(:tweet_id) { should == '180864326289207297' }
+  its(:text) { should == "OCaml でなにか書く課題がほしいです。プロコン的問題はのぞく" }
+  its(:user_id) { should == '287606751' }
+  its(:screen_name) { should == 'tomy_kaira' }
+  its(:created_at) { should == DateTime.parse("Sat Mar 17 03:53:10 +0000 2012") }
+end
+describe MurmuringSpider::Status do
+  let(:twitter_status) { Marshal.load(File.read(File.dirname(__FILE__) + '/../' + filename)) }
+  subject { MurmuringSpider::Status.new(twitter_status) }
+  context 'from user_timeline result' do
+    let(:filename) { 'twitter_status.dump' }
+    it_should_behave_like 'my tweet object'
+  end
+  context 'from search result' do
+    let(:filename) { 'twitter_search_status.dump' }
+    it_should_behave_like 'my tweet object'
+  end
+  context 'when the user extend the field' do
+    let(:filename) { 'twitter_status.dump' }
+    before do
+      MurmuringSpider::Status.extend(:source)
+    end
+    it_should_behave_like 'my tweet object'
+    its(:source) { should include 'web' }
+  end
+  context 'when the user extend the field with get strategy' do
+    before do
+      MurmuringSpider::Status.extend(:user_name) { |status| status.user ? status.user.name : status.from_user_name }
+    end
+    context 'with user_timeline result' do
+      let(:filename) { 'twitter_status.dump' }
+      its(:user_name) { should include 'といれ' }
+    end
+    context 'with search result' do
+      let(:filename) { 'twitter_search_status.dump' }
+      its(:user_name) { should include 'といれ' }
+    end
+  end
+  context 'when the tweet has shortened URL' do
+    let(:twitter_status) { Twitter::Status.new('text' => 'expansion test http://example.com/shortened', 'entities' => { 'urls' => [{"expanded_url"=>"http://www.example.com/expanded", "url"=>"http://example.com/shortened"}] }) }
+    subject { MurmuringSpider::Status.new(twitter_status) }
+    its(:text) { should ==  'expansion test http://www.example.com/expanded'}
+  end
+end

data/spec/spec_helper.rb ADDED

@@ -0,0 +1,30 @@
+# This file was generated by the `rspec --init` command. Conventionally, all
+# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
+# Require this file using `require "spec_helper.rb"` to ensure that it is only
+# loaded once.
+#
+# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+require 'database_cleaner'
+require 'murmuring_spider'
+$LOAD_PATH.push File.expand_path(__FILE__ + '/../lib')
+RSpec.configure do |config|
+  config.treat_symbols_as_metadata_keys_with_true_values = true
+  config.run_all_when_everything_filtered = true
+  config.filter_run :focus
+  config.before(:suite) do
+    DatabaseCleaner.strategy = :truncation
+  end
+  config.before(:each) do
+    DatabaseCleaner.start
+  end
+  config.after(:each) do
+    DatabaseCleaner.clean
+  end
+end
+MurmuringSpider.database_init('sqlite3::memory:')

data/spec/twitter_search_status.dump ADDED

	@@ -0,0 +1 @@
1	+ o:Twitter::Status:@attrs{I"created_at:ETI"$Sat, 17 Mar 2012 03:53:10 +0000;TI"from_user;TI"tomy_kaira;TI"from_user_id;Ti߇$I"from_user_id_str;TI"287606751;TI"from_user_name;TI"といれ;TI"geo;T0I"id;Tl+ 0�%��I"id_str;TI"180864326289207297;TI"iso_language_code;TI"ja;TI"

data/spec/twitter_status.dump ADDED

Binary file

metadata ADDED

@@ -0,0 +1,200 @@
+--- !ruby/object:Gem::Specification
+name: murmuring_spider
+version: !ruby/object:Gem::Version
+  prerelease:
+  version: 0.0.2
+platform: ruby
+authors:
+- tomykaira
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2012-04-13 00:00:00 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: dm-core
+  requirement: &id001 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+  type: :runtime
+  prerelease: false
+  version_requirements: *id001
+- !ruby/object:Gem::Dependency
+  name: dm-migrations
+  requirement: &id002 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+  type: :runtime
+  prerelease: false
+  version_requirements: *id002
+- !ruby/object:Gem::Dependency
+  name: dm-validations
+  requirement: &id003 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+  type: :runtime
+  prerelease: false
+  version_requirements: *id003
+- !ruby/object:Gem::Dependency
+  name: twitter
+  requirement: &id004 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+  type: :runtime
+  prerelease: false
+  version_requirements: *id004
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: &id005 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+  type: :development
+  prerelease: false
+  version_requirements: *id005
+- !ruby/object:Gem::Dependency
+  name: guard
+  requirement: &id006 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+  type: :development
+  prerelease: false
+  version_requirements: *id006
+- !ruby/object:Gem::Dependency
+  name: guard-rspec
+  requirement: &id007 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+  type: :development
+  prerelease: false
+  version_requirements: *id007
+- !ruby/object:Gem::Dependency
+  name: database_cleaner
+  requirement: &id008 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+  type: :development
+  prerelease: false
+  version_requirements: *id008
+- !ruby/object:Gem::Dependency
+  name: dm-sqlite-adapter
+  requirement: &id009 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+  type: :development
+  prerelease: false
+  version_requirements: *id009
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: &id010 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+  type: :development
+  prerelease: false
+  version_requirements: *id010
+description: |-
+  MurmuringSpider is a concise Twitter crawler.
+  When we write a data-mining / text-mining application based on twitter timeline, we have to collect and store tweets first.
+  I am irritated with writing such crawler repeatedly, so I wrote this.
+  What you have to do is only to add query and to run them periodically.
+  Thanks to consistent Twitter API and twitter gem (http://twitter.rubyforge.org/), it is quite easy to track various types of timelines (such as user_timeline, home_timeline, search...)
+email:
+- tomykaira@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- .rspec
+- .travis.yml
+- Gemfile
+- Guardfile
+- LICENSE
+- README.md
+- Rakefile
+- lib/murmuring_spider.rb
+- lib/murmuring_spider/operation.rb
+- lib/murmuring_spider/status.rb
+- lib/murmuring_spider/version.rb
+- murmuring_spider.gemspec
+- spec/murmuring_spider/operation_spec.rb
+- spec/murmuring_spider/status_spec.rb
+- spec/spec_helper.rb
+- spec/twitter_search_status.dump
+- spec/twitter_status.dump
+homepage: https://github.com/tomykaira/murmuring_spider
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: -814915267
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: -814915267
+      segments:
+      - 0
+      version: "0"
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.17
+signing_key:
+specification_version: 3
+summary: MurmuringSpider is a concise Twitter crawler with DataMapper.
+test_files:
+- spec/murmuring_spider/operation_spec.rb
+- spec/murmuring_spider/status_spec.rb
+- spec/spec_helper.rb
+- spec/twitter_search_status.dump
+- spec/twitter_status.dump