extraloop-redis-storage 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt ADDED
@@ -0,0 +1,5 @@
1
+ == 0.0.2 / 2012-02-20
2
+ * Fixed scope bug
3
+
4
+ == 0.0.1 / 2012-02-20
5
+ * Project Birthday!
data/README.rdoc ADDED
@@ -0,0 +1,57 @@
1
+ = Extraloop Redis Storage
2
+
3
+ == Description
4
+
5
+ Persistence layer for the {ExtraLoop}[https://github.com/afiore/extraloop] data extraction toolkit.
6
+ The module is implemented as a small wrapper built on top of {Ohm}[http://ohm.keyvalue.org], an object-hash mapping library which
7
+ makes storing and managing Ruby objects in Redis easy and convenient.
8
+
9
+ == Installation
10
+
11
+ gem install extraloop-redis-storage
12
+
13
+ == Usage
14
+
15
+ Extraloop's Redis storage module decorates +ExtraLoop::ScraperBase+ and +ExtraLoop::IterativeScraper+ instances
16
+ with the +set_storage+ method: a helper method that specifies how the scraped data should be stored.
17
+
18
+ require "extraloop/redis-storage"
19
+
20
+ class AmazonReview < ExtraLoop::Storage::Record
21
+ attribute :title
22
+ attribute :author
23
+ attribute :star
24
+
25
+ def validate
26
+ assert (0..5).include?(star.to_i), "Star not in range"
27
+ end
28
+ end
29
+
30
+ scraper = AmazonReviewScraper.new("0262560992")
31
+ .set_storage(AmazonReview, "Amazon reviews of 'The Little Schemer'")
32
+ .run()
33
+
34
+ At each scraper run, the ExtraLoop storage module internally instantiates a
35
+ session (see +ExtraLoop::Storage::ScrapingSession+) and links the extracted records to it.
36
+ The +AmazonReview+ instances extracted and stored in the example above can in fact be fetched by calling
37
+ Ohm's +find+ with the session id as argument.
38
+
39
+ reviews = AmazonReview.find :session_id => scraper.session.id
40
+
41
+ The same set of reviews can alternatively be retrieved by calling the +records+ method on the scraping
42
+ session instance:
43
+
44
+ reviews = scraper.session.records AmazonReview
45
+
46
+
47
+ == The #set_storage method
48
+
49
+ The +set_storage+ method can be called with the following arguments:
50
+
51
+ * _model_ A Ruby constant specifying the model to be used for storing the extracted data (optional; when this argument is not provided, a simple, non-validating model will be generated on the fly).
52
+ * _session_title_ A human readable name for the extracted dataset.
53
+
54
+ == Running the test suite
55
+
56
+ The test suite can be run by executing the +rspec\ \*+ command from within the +spec/+ directory.
57
+
@@ -0,0 +1,24 @@
1
+ require "rubygems"
2
+ require "extraloop"
3
+ require "date"
4
+ require "./lib/amazon_review_scraper.rb"
5
+ require "../lib/extraloop/redis-storage.rb"
6
+
7
+
8
# Model for a single Amazon review extracted by the example scraper.
# Inherits its persistence behaviour from ExtraLoop::Storage::Record.
class AmazonReview < ExtraLoop::Storage::Record
  attribute :title
  attribute :rank
  attribute :date

  # Validates the record.
  #
  # Calls +super+ first so that the validations declared by the parent
  # class are not silently discarded by this override, then checks that
  # the rank (coerced with +to_i+) lies within 0..5.
  def validate
    super
    assert((0..5).include?(rank.to_i), "Rank not in range")
  end
end
17
+
18
# Scrape the reviews of one product, persisting each as an AmazonReview.
scraper = AmazonReviewScraper.new("0262560992").set_storage(AmazonReview).run

# Fetch back the records linked to the session created for this scraper.
records = AmazonReview.find(:session_id => scraper.session.id)
puts "#{records.size} reviews have been created"
24
+
@@ -0,0 +1,14 @@
1
# Example scraper collecting the rank, title and date of each review
# found on an Amazon UK product-reviews page.
class AmazonReviewScraper < ExtraLoop::ScraperBase
  # review_id - identifier interpolated into the product-reviews URL.
  def initialize(review_id)
    super("http://www.amazon.co.uk/product-reviews/#{review_id}/ref=dp_top_cm_cr_acr_txt?ie=UTF8&showViewpoints=1")

    # Each matched star sprite is mapped to its grandparent node, which
    # is then used as the element the extractors below run against.
    loop_on("#productReviews span[class^='swSprite s_star']") do |star_nodes|
      star_nodes.map { |star| star.parent.parent }
    end

    # The rank is the leading digit of the sprite's title attribute.
    extract(:rank, "span[class^=swSprite]", :title) do |title|
      leading_digit = title && title.match(/^(\d)/)
      leading_digit && leading_digit[1].to_i
    end

    extract(:title, "b")

    extract(:date, "nobr") do |date|
      Date.parse(date.text) if date
    end
  end
end
@@ -0,0 +1,16 @@
1
# Builds, on the fly, a subclass of ExtraLoop::Storage::Record for storing
# an ExtraLoop generated dataset with Ohm, and registers it as a
# top-level constant.
class ExtraLoop::Storage::DatasetFactory
  # classname  - name for the generated class; it is converted with
  #              +to_s.capitalize+, so :MyCollection becomes "Mycollection".
  # attributes - list of attribute names declared on the generated class.
  #
  # Note: Object.const_set is called unconditionally, so a constant that
  # already exists under the same name is reassigned.
  def initialize(classname, attributes=[])
    @classname = classname.to_s.capitalize

    dataset_class = Class.new(ExtraLoop::Storage::Record) do
      attributes.each { |name| attribute name }
    end

    Object.const_set(@classname, dataset_class)
  end

  # Returns the class currently registered under the generated name.
  def get_class
    Object.const_get(@classname)
  end
end
@@ -0,0 +1,44 @@
1
# Base Ohm model for extracted records. Every record references the
# scraping session it belongs to and is indexed by that session's id.
class ExtraLoop::Storage::Record < Ohm::Model
  include Ohm::Boundaries
  include Ohm::Timestamping

  reference :session, ExtraLoop::Storage::ScrapingSession
  attribute :extracted_at
  index :session_id

  class << self
    # Pulls in the attributes and indices of the superclasses before
    # delegating to the parent implementation of +create+.
    def create(attrs={})
      _inherit!
      super(attrs)
    end

    private

    #
    # Walks up the class hierarchy, stopping at ExtraLoop::Storage::Record,
    # and merges the Ohm attributes and indices of every superclass into
    # this class (duplicates are removed in place).
    #
    def _inherit!
      ancestor = self

      until ancestor == ExtraLoop::Storage::Record
        ancestor = ancestor.superclass
        attributes.concat(ancestor.attributes).uniq!
        indices.concat(ancestor.indices).uniq!
      end
    end
  end

  # Pulls in the attributes and indices of the superclasses before
  # delegating to the parent constructor.
  def initialize(attrs={})
    self.class.send(:_inherit!)
    super(attrs)
  end

  # Returns the parent's hash representation merged with one entry per
  # declared attribute (attribute name => current value).
  def to_hash
    base = super
    own_values = attributes.reduce({}) do |memo, name|
      memo.merge(name => send(name))
    end
    base.merge(own_values)
  end

  # A record is only valid when it is linked to a session.
  def validate
    assert_present :session
  end
end
@@ -0,0 +1,12 @@
1
# Ohm model representing one scraping session; extracted records are
# linked to it through their +session_id+.
class ExtraLoop::Storage::ScrapingSession < Ohm::Model

  include Ohm::Boundaries
  include Ohm::Timestamping

  attribute :title

  # Finds the records of the given collection that belong to this session.
  #
  # collection - either the model class itself, or its name as a String or
  #              Symbol (namespaced names such as "Foo::Bar" are accepted).
  #
  # A class is used directly instead of being converted to a string and
  # looked up again: that round trip fails for anonymous classes, and for
  # namespaced classes on Rubies whose const_get does not resolve "A::B".
  #
  # Returns whatever the model's +find+ returns for this session's id.
  # Raises NameError when a name cannot be resolved to a constant.
  def records(collection)
    model = collection.is_a?(Class) ? collection : resolve_model(collection)
    model.find(:session_id => self.id)
  end

  private

  # Resolves a (possibly namespaced) constant name starting from Object.
  def resolve_model(name)
    name.to_s.split("::").reject { |segment| segment.empty? }.inject(Object) do |scope, segment|
      scope.const_get(segment)
    end
  end
end
@@ -0,0 +1,30 @@
1
+ require "json"
2
+ require "rubygems"
3
+ require "redis"
4
+ require 'pry'
5
+ require "ohm"
6
+ require "ohm/contrib"
7
+ require "extraloop"
8
+
9
# Resolve the directory containing this file and append it to the load
# path, so that the sibling "scraper_base" file can be required by name.
base_path = File.realpath(File.dirname(__FILE__))
$LOAD_PATH << "#{base_path}"
require "scraper_base"
12
+
13
+
14
+
15
module ExtraLoop
  # Redis/Ohm based persistence layer for ExtraLoop scrapers.
  module Storage
    # Kept in sync with the released gem version (0.0.2).
    # `||=` avoids an "already initialized constant" warning when this
    # file is evaluated more than once (the specs use `load`).
    VERSION ||= "0.0.2"

    class << self
      # Opens the Redis connection used by the storage models.
      # All arguments are forwarded untouched to Ohm.connect.
      def connect(*args)
        Ohm.connect(*args)
      end
    end
  end
end
26
+
27
# Register the storage classes for lazy loading: each constant is mapped
# to its file under redis-storage/ next to this file.
[
  [:Record,          "record.rb"],
  [:ScrapingSession, "scraping_session.rb"],
  [:DatasetFactory,  "dataset_factory.rb"]
].each do |constant, file_name|
  ExtraLoop::Storage.autoload constant, "#{base_path}/redis-storage/#{file_name}"
end
30
+
@@ -0,0 +1,38 @@
1
# Storage extensions for ExtraLoop::ScraperBase: links the scraper to a
# scraping session and a dataset model, and turns every batch of extracted
# records into model instances when the :data hook fires.
class ExtraLoop::ScraperBase
  attr_reader :session

  # Configures how the extracted data is stored.
  #
  # Arguments are recognised by type, not by position:
  # * the first Symbol, or object responding to +new+, is the model;
  # * the first String is the session title.
  # Both fall back to a name derived from the scraper's class name
  # (e.g. "extraloop_scraperbase_data").
  #
  # When a block is given it receives the instantiated records; the
  # records are saved only when no block is given or when the block
  # returns a falsy value.
  def set_storage(*args)
    fallback_name = self.class.to_s.gsub(/(:)+/,'_').downcase + "_data"

    model = args.detect { |arg| arg.is_a?(Symbol) or arg.respond_to?(:new) } || fallback_name.to_sym
    title = args.detect { |arg| arg.is_a?(String) } || fallback_name

    log_session! title

    # A model given as a name (Symbol) is generated on the fly, with one
    # attribute per declared extractor.
    @model =
      if model.respond_to?(:new)
        model
      else
        ExtraLoop::Storage::DatasetFactory.new(model.to_sym, @extractor_args.map(&:first)).get_class
      end

    on :data do |results|
      # TODO: avoid calling send in the scraper object
      # NOTE(review): @scraper is resolved in whatever context the hook
      # block is executed in - presumably the extraction environment; confirm.
      records = results.map { |item| @scraper.send(:instanciate_model, item) }
      consumed = block_given? && yield(records)
      consumed || records.each { |record| record.save if record.respond_to?(:save) }
    end
  end

  protected

  # Creates the scraping session, unless one has already been created
  # for this scraper.
  def log_session!(title="")
    return @session if @session
    @session = ExtraLoop::Storage::ScrapingSession.create(:title => title)
  end

  # Converts an extracted record into an instance of the dataset model
  # selected by #set_storage, linked to the current session.
  # Objects responding to +marshal_dump+ are converted through it first.
  def instanciate_model(record)
    record_hash =
      if record.respond_to?(:marshal_dump)
        record.marshal_dump
      else
        record
      end

    @model.new({:session => @session }.merge(record_hash))
  end
end
@@ -0,0 +1,43 @@
1
+ load "../lib/extraloop/redis-storage.rb"
2
+
3
describe ExtraLoop::Storage::DatasetFactory do
  Ohm.connect :url => "redis://127.0.0.1:6379/7"

  describe "#get_class" do
    # Every example gets a freshly generated Blurb class.
    before do
      @factory = ExtraLoop::Storage::DatasetFactory.new(:blurb, [:a, :b, :c])
    end

    # Records are only valid when linked to a session.
    context "with invalid input" do
      subject { @factory.get_class.new :a => 22, :b => 33, :c => 44 }

      it { should respond_to :a }
      it { should respond_to :b }
      it { should respond_to :c }
      it { should respond_to :save }
      it { subject.valid?.should be_false }
    end

    context "with valid input" do
      before do
        @session = ExtraLoop::Storage::ScrapingSession.create
      end

      subject { @factory.get_class.new :session => @session }
      it { subject.valid?.should be_true }
    end

    # Runs for both contexts, so the generated constant never leaks into
    # the next example (re-creating it would otherwise reassign an
    # already defined constant), and keys created under it are removed.
    after do
      Object.send(:remove_const, :Blurb) if Object.const_defined?(:Blurb)

      patterns = ["Mycollection", "ExtraLoop", "Blurb"]
      patterns.each { |pattern| Ohm.redis.keys("*#{pattern}*").each { |key| Ohm.redis.del(key)} }
    end
  end
end
@@ -0,0 +1,82 @@
1
+ load "../lib/extraloop/redis-storage.rb"
2
+
3
# Minimal Record subclass used as the fixture for the examples below.
class MyRecord < ExtraLoop::Storage::Record
  attribute :foo
  attribute :bar
end


describe ExtraLoop::Storage::Record do
  # Use the same dedicated test database as the other specs. Without this,
  # running this file on its own would apply the key-deleting `after` hook
  # below to the default connection.
  Ohm.connect :url => "redis://127.0.0.1:6379/7"

  before do
    @session = ExtraLoop::Storage::ScrapingSession.create
  end

  context "record subclasses" do

    describe "#save" do

      subject { MyRecord.new(:session => @session, :extracted_at => Time.now).save }

      it { subject.extracted_at.should be_a_kind_of(Time) }
      it { subject.session.should eql(@session) }

      context "without a session attribute" do
        subject { MyRecord.new }
        it { subject.valid?.should_not be_true }
      end
    end

    describe "#create" do
      subject { MyRecord.create(:session => @session, :extracted_at => Time.now) }
      it { subject.extracted_at.should be_a_kind_of(Time) }
      it { subject.session.should eql(@session) }
    end

    describe "Record::last" do
      before do
        2.times { MyRecord.create(:session => @session) }
      end

      subject { MyRecord }
      it { should respond_to :last }
    end

    describe "#to_hash" do
      before do
        @record = MyRecord.create :foo => 'blurbzz', :bar => 'zzzzzz', :session => @session
      end

      subject { @record.to_hash }

      it "should have converted its attributes as hash keys" do
        [:session_id, :foo, :bar, :created_at, :updated_at].each { |key| subject.should have_key(key) and subject[key].should_not be_nil }
      end

      context "excluding attributes" do

        before do
          # Anonymous subclass whose to_hash hides the password attribute.
          @user_class = Class.new(ExtraLoop::Storage::Record) do
            attribute :password
            attribute :username

            def to_hash
              super.reject { |attr| attr === :password }
            end
          end
        end

        subject { @user_class.create(:session => @session, :username => 'bob', :password => 'secret' ).to_hash }

        it { should_not have_key(:password) }
        it { should have_key(:username) }
        it { should have_key(:session_id) }
      end
    end
  end

  # Deletes every key matching the glob below from the connected database.
  after do
    redis = Ohm.redis
    redis.keys("[^art]*").each { |key| redis.del(key) }
  end
end
@@ -0,0 +1,45 @@
1
+ load "../lib/extraloop/redis-storage.rb"
2
+
3
+
4
describe ExtraLoop::ScraperBase do
  Ohm.connect :url => "redis://127.0.0.1:6379/7"

  before(:each) do
    @records = records = (1..10).to_a.map { |n| OpenStruct.new :foo => "foo#{n}" }
    @scraper = ExtraLoop::ScraperBase.new("http://someurl.net").
      loop_on("*").
      extract(:foo).
      extract(:bar)

    env = ExtraLoop::ExtractionEnvironment.new(@scraper, nil, @records)

    # Replace #run with a stub that fires the :data hook directly with the
    # canned records instead of performing a real scrape.
    @scraper.define_singleton_method :run do
      @environment = env
      self.run_hook :data, [records]
    end
  end

  describe "#set_storage" do
    context "with no arguments but a block" do
      before do
        received_records = nil

        @scraper.
          set_storage { |records| received_records = records }.
          run()

        @received_records = received_records
      end

      # The block receives instances of the model generated from the
      # scraper's class name, not the raw OpenStruct records.
      it "all records should be instances of the generated dataset class" do
        @received_records.all? { |record| record.is_a?(Extraloop_scraperbase_data) }.should be_true
      end
    end

    context "with title argument and no block" do
      before do
        @scraper.set_storage "my dummy dataset"
      end

      it "should use the title for the scraping session" do
        @scraper.session.title.should eql("my dummy dataset")
      end
    end
  end
end
45
+
@@ -0,0 +1,25 @@
1
+ $VERBOSE=nil
2
+ load "../lib/extraloop/redis-storage.rb"
3
+
4
describe ExtraLoop::Storage::ScrapingSession do
  Ohm.connect :url => "redis://127.0.0.1:6379/7"

  describe "#records" do
    before(:each) do
      # The factory capitalizes the name, so the class is registered as
      # Mycollection (not MyCollection).
      my_collection = ExtraLoop::Storage::DatasetFactory.new(:MyCollection).get_class
      @session = ExtraLoop::Storage::ScrapingSession.create
      5.times { my_collection.create(:session => @session) }
    end

    context "dataset class exists" do
      context "passing a constant" do
        subject { @session.records(Mycollection) }
        it { should have(5).items }
        it { subject.all? { |record| record.valid? }.should be_true }
      end
    end

    # Same cleanup as the dataset factory spec: remove the keys created by
    # the example, and drop the generated constant so that the next
    # `before` does not reassign an already defined constant.
    after(:each) do
      patterns = ["Mycollection", "ExtraLoop"]
      patterns.each { |pattern| Ohm.redis.keys("*#{pattern}*").each { |key| Ohm.redis.del(key) } }

      Object.send(:remove_const, :Mycollection) if Object.const_defined?(:Mycollection)
    end
  end
end
metadata ADDED
@@ -0,0 +1,125 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: extraloop-redis-storage
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Andrea Fiore
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-02-20 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: extraloop
16
+ requirement: &16470780 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: 0.0.3
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *16470780
25
+ - !ruby/object:Gem::Dependency
26
+ name: ohm
27
+ requirement: &16470320 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ~>
31
+ - !ruby/object:Gem::Version
32
+ version: 0.1.3
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *16470320
36
+ - !ruby/object:Gem::Dependency
37
+ name: ohm-contrib
38
+ requirement: &16469860 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ~>
42
+ - !ruby/object:Gem::Version
43
+ version: 0.1.2
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: *16469860
47
+ - !ruby/object:Gem::Dependency
48
+ name: rspec
49
+ requirement: &16469400 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: 2.7.0
55
+ type: :development
56
+ prerelease: false
57
+ version_requirements: *16469400
58
+ - !ruby/object:Gem::Dependency
59
+ name: rr
60
+ requirement: &16468940 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ~>
64
+ - !ruby/object:Gem::Version
65
+ version: 1.0.4
66
+ type: :development
67
+ prerelease: false
68
+ version_requirements: *16468940
69
+ - !ruby/object:Gem::Dependency
70
+ name: pry
71
+ requirement: &16468480 !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ~>
75
+ - !ruby/object:Gem::Version
76
+ version: 0.9.7.4
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: *16468480
80
+ description: Redis+Ohm based storage for data sets extracted using the ExtraLoop toolkit.
81
+ email: andrea.giulio.fiore@googlemail.com
82
+ executables: []
83
+ extensions: []
84
+ extra_rdoc_files: []
85
+ files:
86
+ - History.txt
87
+ - README.rdoc
88
+ - examples/amazon_reviews.rb
89
+ - examples/lib/amazon_review_scraper.rb
90
+ - lib/extraloop/redis-storage.rb
91
+ - lib/extraloop/redis-storage/dataset_factory.rb
92
+ - lib/extraloop/redis-storage/record.rb
93
+ - lib/extraloop/redis-storage/scraping_session.rb
94
+ - lib/extraloop/scraper_base.rb
95
+ - spec/dataset_factory_spec.rb
96
+ - spec/record_spec.rb
97
+ - spec/scraper_base_spec.rb
98
+ - spec/scraping_session_spec.rb
99
+ homepage: http://github.com/afiore/extraloop-redis-storage
100
+ licenses: []
101
+ post_install_message:
102
+ rdoc_options:
103
+ - --charset=UTF-8
104
+ require_paths:
105
+ - lib
106
+ required_ruby_version: !ruby/object:Gem::Requirement
107
+ none: false
108
+ requirements:
109
+ - - ! '>='
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ required_rubygems_version: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ! '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ requirements: []
119
+ rubyforge_project: extraloop-redis-storage
120
+ rubygems_version: 1.8.10
121
+ signing_key:
122
+ specification_version: 2
123
+ summary: Redis storage for Extraloop.
124
+ test_files: []
125
+ has_rdoc: