extraloop-redis-storage 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt ADDED
@@ -0,0 +1,5 @@
1
+ == 0.0.2 / 2012-02-20
2
+ * Fixed scope bug
3
+
4
+ == 0.0.1 / 2012-02-20
5
+ * Project Birthday!
data/README.rdoc ADDED
@@ -0,0 +1,57 @@
1
+ = Extraloop Redis Storage
2
+
3
+ == Description
4
+
5
+ Persistence layer for the {ExtraLoop}[https://github.com/afiore/extraloop] data extraction toolkit.
6
+ The module is implemented as a small wrapper built on top of {Ohm}[http://ohm.keyvalue.org], an object-hash mapping library which
7
+ makes storing and managing Ruby objects in Redis easy and convenient.
8
+
9
+ == Installation
10
+
11
+ gem install extraloop-redis-storage
12
+
13
+ == Usage
14
+
15
+ Extraloop's Redis storage module decorates +ExtraLoop::ScraperBase+ and +ExtraLoop::IterativeScraper+ instances
16
+ with the +set_storage+ method: a helper method that lets you specify how the scraped data should be stored.
17
+
18
+ require "extraloop/redis-storage"
19
+
20
+ class AmazonReview < ExtraLoop::Storage::Record
21
+ attribute :title
22
+ attribute :author
23
+ attribute :star
24
+
25
+ def validate
26
+ assert (0..5).include?(star.to_i), "Star rating not in range"
27
+ end
28
+ end
29
+
30
+ scraper = AmazonReviewScraper.new("0262560992")
31
+ .set_storage(AmazonReview, "Amazon reviews of 'The Little Schemer'")
32
+ .run()
33
+
34
+ At each scraper run, the ExtraLoop storage module internally instantiates a
35
+ session (see +ExtraLoop::Storage::ScrapingSession+) and links the extracted records to it.
36
+ The +AmazonReview+ instances extracted and stored in the example above can in fact be fetched by calling
37
+ Ohm's +find+ with the session id as argument.
38
+
39
+ reviews = AmazonReview.find :session_id => scraper.session.id
40
+
41
+ The same set of reviews can alternatively be retrieved by calling the +records+ method on the scraping
42
+ session instance:
43
+
44
+ reviews = scraper.session.records AmazonReview
45
+
46
+
47
+ == The #set_storage method
48
+
49
+ The +set_storage+ method can be called with the following arguments:
50
+
51
+ * _model_ A Ruby constant specifying the model to be used for storing the extracted data (optional; when this argument is not provided, a simple, non-validating model will be generated on the fly).
52
+ * _session_title_ A human readable name for the extracted dataset.
53
+
54
+ == Running the test suite
55
+
56
+ The test suite can be run by executing the +rspec\ \*+ command from within the +spec/+ directory.
57
+
@@ -0,0 +1,24 @@
1
+ require "rubygems"
2
+ require "extraloop"
3
+ require "date"
4
+ require "./lib/amazon_review_scraper.rb"
5
+ require "../lib/extraloop/redis-storage.rb"
6
+
7
+
8
+ class AmazonReview < ExtraLoop::Storage::Record # example model holding one scraped Amazon review
9
+ attribute :title
10
+ attribute :rank
11
+ attribute :date
12
+
13
+ def validate # overrides Record#validate without calling super, so the inherited assert_present :session check does not run for this model
14
+ assert (0..5).include?(rank.to_i), "Rank not in range" # rank.to_i must fall between 0 and 5
15
+ end
16
+ end
17
+
18
+ scraper = AmazonReviewScraper.new("0262560992").
19
+ set_storage(AmazonReview).
20
+ run # the chain relies on set_storage and run each returning an object that exposes #session
21
+
22
+ records = AmazonReview.find :session_id => scraper.session.id # look up the reviews linked to this run's scraping session
23
+ puts "#{records.size} reviews have been created"
24
+
@@ -0,0 +1,14 @@
1
+ class AmazonReviewScraper < ExtraLoop::ScraperBase # example scraper for one product's review listing on amazon.co.uk
2
+ def initialize(review_id) # review_id is interpolated into the product-reviews URL below
3
+ url = "http://www.amazon.co.uk/product-reviews/#{review_id}/ref=dp_top_cm_cr_acr_txt?ie=UTF8&showViewpoints=1"
4
+ super(url)
5
+
6
+ loop_on("#productReviews span[class^='swSprite s_star']") do |nodes| # start from the star-rating sprites...
7
+ nodes.map { |node| node.parent.parent } # ...and iterate over each sprite's grandparent node instead
8
+ end
9
+
10
+ extract(:rank, "span[class^=swSprite]", :title) { |title| title && title.match(/^(\d)/) && $1.to_i } # leading digit of the value as an Integer; presumably the third argument selects the title attribute — confirm against ExtraLoop
11
+ extract(:title, "b")
12
+ extract(:date, "nobr") { |date| Date.parse(date.text) if date } # parses the node text into a Date; yields nil when no node was matched
13
+ end
14
+ end
@@ -0,0 +1,16 @@
1
+ # Creates a simple class to store an ExtraLoop
2
+ # generated dataset using Ohm
3
+
4
+ class ExtraLoop::Storage::DatasetFactory
5
+ def initialize(classname, attributes=[])
6
+ @classname = classname.to_s.capitalize
7
+
8
+ Object.const_set(@classname, Class.new(ExtraLoop::Storage::Record) {
9
+ attributes.each { |attr| attribute attr }
10
+ })
11
+ end
12
+
13
+ def get_class
14
+ Object.const_get(@classname)
15
+ end
16
+ end
@@ -0,0 +1,44 @@
1
+ class ExtraLoop::Storage::Record < Ohm::Model # base class for every stored dataset record
2
+ include Ohm::Boundaries
3
+ include Ohm::Timestamping
4
+
5
+ reference :session, ExtraLoop::Storage::ScrapingSession # each record belongs to one scraping session
6
+ attribute :extracted_at
7
+ index :session_id # lets callers look records up with find(:session_id => ...)
8
+
9
+ def initialize attrs={}
10
+ self.class.send :_inherit! # send is required because _inherit! is a private class method
11
+ super attrs
12
+ end
13
+
14
+ def self.create attrs={}
15
+ _inherit! # make sure superclass attributes and indices are present before the record is created
16
+ super attrs
17
+ end
18
+
19
+ def to_hash # the parent's hash plus one attribute => value pair for every declared attribute
20
+ super.merge(attributes.reduce({}) { |memo, attribute|
21
+ memo.merge(attribute => send(attribute))
22
+ })
23
+ end
24
+
25
+ def validate # a record is only valid when it is linked to a session
26
+ assert_present :session
27
+ end
28
+
29
+ #
30
+ # Walks up the class hierarchy and merges the Ohm attributes
31
+ # and indices of every superclass into the receiving class.
32
+ #
33
+ def self._inherit!
34
+ klass = self
35
+
36
+ while klass != ExtraLoop::Storage::Record # stop once the base Record class is reached
37
+ attributes.concat(klass.superclass.attributes).uniq! # always appends to the receiver's own list; uniq! keeps repeated calls from adding duplicates
38
+ indices.concat(klass.superclass.indices).uniq!
39
+ klass = klass.superclass
40
+ end
41
+ end
42
+
43
+ private_class_method :_inherit!
44
+ end
@@ -0,0 +1,12 @@
1
+ class ExtraLoop::Storage::ScrapingSession < Ohm::Model
2
+
3
+ include Ohm::Boundaries
4
+ include Ohm::Timestamping
5
+
6
+ attribute :title
7
+
8
+ def records(collection)
9
+ Kernel.const_get(collection.to_s).
10
+ find(:session_id => self.id)
11
+ end
12
+ end
@@ -0,0 +1,30 @@
1
+ require "json"
2
+ require "rubygems"
3
+ require "redis"
4
+ require 'pry'
5
+ require "ohm"
6
+ require "ohm/contrib"
7
+ require "extraloop"
8
+
9
+ base_path = File.realpath(File.dirname(__FILE__))
10
+ $: << "#{base_path}"
11
+ require "scraper_base"
12
+
13
+
14
+
15
+ module ExtraLoop
16
+ module Storage
17
+ VERSION ||= "0.0.1"
18
+
19
+ class << self
20
+ def connect(*args)
21
+ Ohm.connect(*args)
22
+ end
23
+ end
24
+ end
25
+ end
26
+
27
+ ExtraLoop::Storage.autoload :Record, "#{base_path}/redis-storage/record.rb" # each file below is loaded on first reference to its constant
28
+ ExtraLoop::Storage.autoload :ScrapingSession, "#{base_path}/redis-storage/scraping_session.rb"
29
+ ExtraLoop::Storage.autoload :DatasetFactory, "#{base_path}/redis-storage/dataset_factory.rb"
30
+
@@ -0,0 +1,38 @@
1
+ class ExtraLoop::ScraperBase
2
+ attr_reader :session
3
+
4
+ def set_storage(*args)
5
+ model = args.detect { |arg| arg.is_a?(Symbol) or arg.respond_to?(:new) }
6
+ title = args.detect { |arg| arg.is_a?(String) }
7
+
8
+ collection_name = self.class.to_s.gsub(/(:)+/,'_').downcase + "_data"
9
+
10
+ title ||= collection_name
11
+ model ||= collection_name.to_sym
12
+
13
+ log_session! title
14
+
15
+ @model = model_klass = model.respond_to?(:new) && model || ExtraLoop::Storage::DatasetFactory.new(model.to_sym, @extractor_args.map(&:first)).get_class
16
+
17
+ on :data do |results|
18
+ # TODO: avoid calling send in the scraper object
19
+ results = results.map { |result| @scraper.send(:instanciate_model, result) }
20
+ block_given? && yield(results) || results.each { |result| result.save if result.respond_to?(:save) }
21
+ end
22
+ end
23
+
24
+ protected
25
+ # Creates a scraping session
26
+ def log_session!(title="")
27
+ @session ||= ExtraLoop::Storage::ScrapingSession.create :title => title
28
+ end
29
+
30
+ # Converts extracted records into instances of the dataset model specified as the first argument
31
+ # of #set_storage
32
+
33
+ def instanciate_model(record)
34
+ record_hash = record.respond_to?(:marshal_dump) ? record.marshal_dump : record
35
+ attrs = {:session => @session }.merge(record_hash)
36
+ @model.new(attrs)
37
+ end
38
+ end
@@ -0,0 +1,43 @@
1
+ load "../lib/extraloop/redis-storage.rb"
2
+
3
+ describe ExtraLoop::Storage::DatasetFactory do
4
+ Ohm.connect :url => "redis://127.0.0.1:6379/7" # run against database 7 of the local Redis server
5
+
6
+ describe "#get_class" do
7
+ context "with invalid input" do # "invalid" here means the record is built without a session
8
+ before do
9
+ @factory = ExtraLoop::Storage::DatasetFactory.new(:blurb, [:a, :b, :c])
10
+ end
11
+
12
+ subject { @factory.get_class.new :a => 22, :b => 33, :c => 44 }
13
+
14
+ it { should respond_to :a }
15
+ it { should respond_to :b }
16
+ it { should respond_to :c }
17
+ it { should respond_to :save }
18
+ it { subject.valid?.should be_false } # Record#validate requires a session
19
+
20
+ after do
21
+ Object.send(:remove_const, :Blurb) # drop the generated constant so the next example can recreate it
22
+ end
23
+ end
24
+
25
+ context "with valid input" do # NOTE(review): unlike the context above, this one never removes the Blurb constant
26
+ before do
27
+ @factory = ExtraLoop::Storage::DatasetFactory.new(:blurb, [:a, :b, :c])
28
+ @session = ExtraLoop::Storage::ScrapingSession.create
29
+ end
30
+
31
+ subject { @factory.get_class.new :session => @session }
32
+ it { subject.valid?.should be_true }
33
+
34
+ end
35
+
36
+
37
+ after do
38
+ patterns = ["Mycollection", "ExtraLoop"]
39
+ patterns.each { |pattern| Ohm.redis.keys("*#{pattern}*").each { |key| Ohm.redis.del(key)} } # delete every key whose name contains one of the patterns
40
+ end
41
+
42
+ end
43
+ end
@@ -0,0 +1,82 @@
1
+ load "../lib/extraloop/redis-storage.rb"
2
+
3
+ class MyRecord < ExtraLoop::Storage::Record # minimal Record subclass used as the fixture for the examples below
4
+ attribute :foo
5
+ attribute :bar
6
+ end
7
+
8
+
9
+ describe ExtraLoop::Storage::Record do
10
+
11
+ before do
12
+ @session = ExtraLoop::Storage::ScrapingSession.create
13
+ end
14
+
15
+ context "record subclasses" do
16
+
17
+ describe "#save" do
18
+
19
+ subject { MyRecord.new(:session => @session, :extracted_at => Time.now).save }
20
+
21
+ it { subject.extracted_at.should be_a_kind_of(Time) }
22
+ it { subject.session.should eql(@session) }
23
+
24
+ context "without a session attribute" do
25
+ subject { MyRecord.new }
26
+ it { subject.valid?.should_not be_true }
27
+ end
28
+ end
29
+
30
+ describe "#create" do
31
+ subject { MyRecord.create(:session => @session, :extracted_at => Time.now) }
32
+ it { subject.extracted_at.should be_a_kind_of(Time) }
33
+ it { subject.session.should eql(@session) }
34
+ end
35
+
36
+ describe "Record::last" do
37
+ before do
38
+ 2.times { MyRecord.create(:session => @session) }
39
+ end
40
+
41
+ subject { MyRecord }
42
+ it { should respond_to :last }
43
+ end
44
+
45
+ describe "#to_hash" do
46
+ before do
47
+ @record = MyRecord.create :foo => 'blurbzz', :bar => 'zzzzzz', :session => @session
48
+ end
49
+
50
+ subject { @record.to_hash }
51
+
52
+ it "should have converted its attributes as hash keys" do
53
+ [:session_id, :foo, :bar, :created_at, :updated_at].each { |key| subject.should have_key(key) and subject[key].should_not be_nil }
54
+ end
55
+
56
+ context "excluding attributes" do
57
+
58
+ before do
59
+ @user_class = Class.new(ExtraLoop::Storage::Record) do
60
+ attribute :password
61
+ attribute :username
62
+
63
+ def to_hash
64
+ super.reject { |attr| attr === :password }
65
+ end
66
+ end
67
+ end
68
+
69
+ subject { @user_class.create(:session => @session, :username => 'bob', :password => 'secret' ).to_hash }
70
+
71
+ it { should_not have_key(:password) }
72
+ it { should have_key(:username) }
73
+ it { should have_key(:session_id) }
74
+ end
75
+ end
76
+ end
77
+
78
+ after do
79
+ redis = Ohm.redis
80
+ redis.keys("[^art]*").each { |key| redis.del(key) }
81
+ end
82
+ end
@@ -0,0 +1,45 @@
1
+ load "../lib/extraloop/redis-storage.rb"
2
+
3
+
4
+ describe ExtraLoop::ScraperBase do
5
+ Ohm.connect :url => "redis://127.0.0.1:6379/7"
6
+
7
+ before(:each) do
8
+ @records = records = (1..10).to_a.map { |n| OpenStruct.new :foo => "foo#{n}" }
9
+ @scraper = ExtraLoop::ScraperBase.new("http://someurl.net").
10
+ loop_on("*").
11
+ extract(:foo).
12
+ extract(:bar)
13
+
14
+ env = ExtraLoop::ExtractionEnvironment.new(@scraper, nil, @records)
15
+
16
+ @scraper.define_singleton_method :run do
17
+ @environment = env
18
+ self.run_hook :data, [records]
19
+ end
20
+ end
21
+
22
+ describe "#set_storage" do
23
+ context "with no arguments but a block" do
24
+ before do
25
+ received_records = nil
26
+
27
+ @scraper.
28
+ set_storage { |records| received_records = records }.
29
+ run()
30
+
31
+ @received_records = received_records
32
+ end
33
+ it "all records should be openstruct instances" do
34
+ @received_records.all? { |record| record.is_a?(Extraloop_scraperbase_data) }.should be_true
35
+ end
36
+ end
37
+
38
+ context "with title argument and no block" do
39
+ before do
40
+ @scraper.set_storage "my dummy dataset"
41
+ end
42
+ end
43
+ end
44
+ end
45
+
@@ -0,0 +1,25 @@
1
+ $VERBOSE=nil
2
+ load "../lib/extraloop/redis-storage.rb"
3
+
4
+ describe ExtraLoop::Storage::ScrapingSession do
5
+ Ohm.connect :url => "redis://127.0.0.1:6379/7" # run against database 7 of the local Redis server
6
+
7
+ describe "#records" do
8
+ before(:each) do
9
+ my_collection = ExtraLoop::Storage::DatasetFactory.new(:MyCollection).get_class # the factory capitalizes the name, hence the Mycollection constant used below
10
+ @session = ExtraLoop::Storage::ScrapingSession.create
11
+ 5.times do
12
+ item = my_collection.create(:session => @session) # item is assigned but never read
13
+ end
14
+ end
15
+
16
+ context "dataset class exists" do
17
+ context "passing a constant" do
18
+ subject { @session.records(Mycollection) }
19
+ it { should have(5).items } # one per record created in the before hook
20
+ it { subject.all? { |record| record.valid? }.should be_true }
21
+ end
22
+ end
23
+
24
+ end
25
+ end
metadata ADDED
@@ -0,0 +1,125 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: extraloop-redis-storage
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Andrea Fiore
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-02-20 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: extraloop
16
+ requirement: &16470780 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: 0.0.3
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *16470780
25
+ - !ruby/object:Gem::Dependency
26
+ name: ohm
27
+ requirement: &16470320 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ~>
31
+ - !ruby/object:Gem::Version
32
+ version: 0.1.3
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *16470320
36
+ - !ruby/object:Gem::Dependency
37
+ name: ohm-contrib
38
+ requirement: &16469860 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ~>
42
+ - !ruby/object:Gem::Version
43
+ version: 0.1.2
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: *16469860
47
+ - !ruby/object:Gem::Dependency
48
+ name: rspec
49
+ requirement: &16469400 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: 2.7.0
55
+ type: :development
56
+ prerelease: false
57
+ version_requirements: *16469400
58
+ - !ruby/object:Gem::Dependency
59
+ name: rr
60
+ requirement: &16468940 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ~>
64
+ - !ruby/object:Gem::Version
65
+ version: 1.0.4
66
+ type: :development
67
+ prerelease: false
68
+ version_requirements: *16468940
69
+ - !ruby/object:Gem::Dependency
70
+ name: pry
71
+ requirement: &16468480 !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ~>
75
+ - !ruby/object:Gem::Version
76
+ version: 0.9.7.4
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: *16468480
80
+ description: Redis+Ohm based storage for data sets extracted using the ExtraLoop toolkit.
81
+ email: andrea.giulio.fiore@googlemail.com
82
+ executables: []
83
+ extensions: []
84
+ extra_rdoc_files: []
85
+ files:
86
+ - History.txt
87
+ - README.rdoc
88
+ - examples/amazon_reviews.rb
89
+ - examples/lib/amazon_review_scraper.rb
90
+ - lib/extraloop/redis-storage.rb
91
+ - lib/extraloop/redis-storage/dataset_factory.rb
92
+ - lib/extraloop/redis-storage/record.rb
93
+ - lib/extraloop/redis-storage/scraping_session.rb
94
+ - lib/extraloop/scraper_base.rb
95
+ - spec/dataset_factory_spec.rb
96
+ - spec/record_spec.rb
97
+ - spec/scraper_base_spec.rb
98
+ - spec/scraping_session_spec.rb
99
+ homepage: http://github.com/afiore/extraloop-redis-storage
100
+ licenses: []
101
+ post_install_message:
102
+ rdoc_options:
103
+ - --charset=UTF-8
104
+ require_paths:
105
+ - lib
106
+ required_ruby_version: !ruby/object:Gem::Requirement
107
+ none: false
108
+ requirements:
109
+ - - ! '>='
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ required_rubygems_version: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ! '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ requirements: []
119
+ rubyforge_project: extraloop-redis-storage
120
+ rubygems_version: 1.8.10
121
+ signing_key:
122
+ specification_version: 2
123
+ summary: Redis storage for Extraloop.
124
+ test_files: []
125
+ has_rdoc: