extraloop-redis-storage 0.0.2 → 0.0.3

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
@@ -4,7 +4,6 @@ require "date"
4
4
  require "./lib/amazon_review_scraper.rb"
5
5
  require "../lib/extraloop/redis-storage.rb"
6
6
 
7
-
8
7
  class AmazonReview < ExtraLoop::Storage::Record
9
8
  attribute :title
10
9
  attribute :rank
@@ -0,0 +1,17 @@
1
+ require "pry"
2
+ require "rubygems"
3
+ require "../lib/extraloop/redis-storage.rb"
4
+
5
+
6
+ google_news_scraper = ExtraLoop::IterativeScraper.new("https://www.google.com/search?tbm=nws&q=Egypt").
7
+ set_iteration(:start, (1..101).step(10)).
8
+ loop_on("h3") { |nodes| nodes.map(&:parent) }.
9
+ extract(:title, "h3.r a").
10
+ extract(:url, "h3.r a", :href).
11
+ extract(:source, "br") { |node| node.next.text.split("-").first }.
12
+ set_storage(:GoogleNewsStory).
13
+ run
14
+
15
+ puts "#{GoogleNewsStory.all.size} news stories fetched..."
16
+
17
+
@@ -3,8 +3,10 @@
3
3
 
4
4
  class ExtraLoop::Storage::DatasetFactory
5
5
  def initialize(classname, attributes=[])
6
- @classname = classname.to_s.capitalize
7
6
 
7
+ @classname = (classname.to_s.split "").each_with_index.map { |char, index| index == 0 && char.upcase or char }.join
8
+
9
+ return if Object.const_defined? @classname
8
10
  Object.const_set(@classname, Class.new(ExtraLoop::Storage::Record) {
9
11
  attributes.each { |attr| attribute attr }
10
12
  })
@@ -1,21 +1,14 @@
1
1
  class ExtraLoop::ScraperBase
2
2
  attr_reader :session
3
3
 
4
- def set_storage(*args)
5
- model = args.detect { |arg| arg.is_a?(Symbol) or arg.respond_to?(:new) }
6
- title = args.detect { |arg| arg.is_a?(String) }
7
-
8
- collection_name = self.class.to_s.gsub(/(:)+/,'_').downcase + "_data"
9
-
4
+ def set_storage(model, title=nil)
5
+ collection_name = "#{Time.now.to_i} #{model.to_s} Dataset"
10
6
  title ||= collection_name
11
- model ||= collection_name.to_sym
12
-
13
7
  log_session! title
14
8
 
15
9
  @model = model_klass = model.respond_to?(:new) && model || ExtraLoop::Storage::DatasetFactory.new(model.to_sym, @extractor_args.map(&:first)).get_class
16
10
 
17
11
  on :data do |results|
18
- # TODO: avoid calling send in the scraper object
19
12
  results = results.map { |result| @scraper.send(:instanciate_model, result) }
20
13
  block_given? && yield(results) || results.each { |result| result.save if result.respond_to?(:save) }
21
14
  end
@@ -20,25 +20,79 @@ describe ExtraLoop::ScraperBase do
20
20
  end
21
21
 
22
22
  describe "#set_storage" do
23
- context "with no arguments but a block" do
23
+ context "with a symbol as the 'model' parameter value" do
24
24
  before do
25
25
  received_records = nil
26
26
 
27
27
  @scraper.
28
- set_storage { |records| received_records = records }.
28
+ set_storage(:MyRecord) { |records| received_records = records }.
29
29
  run()
30
30
 
31
31
  @received_records = received_records
32
32
  end
33
- it "all records should be openstruct instances" do
34
- @received_records.all? { |record| record.is_a?(Extraloop_scraperbase_data) }.should be_true
33
+
34
+ it "Should dynamically create 'MyRecord' class" do
35
+ @received_records.all? { |record| record.is_a?(MyRecord) }.should be_true
36
+ end
37
+
38
+ it "All records should be associated to the same ScrapingSession object" do
39
+ @received_records.all? { |record| record.session.should be_eql @scraper.session }
40
+ end
41
+
42
+ it "Should auto assign a session title" do
43
+ @scraper.session.title.should match /^(\d)*\s(MyRecord)\sDataset$/
44
+ end
45
+ end
46
+
47
+ context "with a constant as the 'model' parameter value" do
48
+ before do
49
+ received_records = nil
50
+
51
+ class MyModel < ExtraLoop::Storage::Record
52
+ attribute :foo
53
+ end
54
+
55
+ @scraper.
56
+ set_storage(MyModel) { |records| received_records = records }.
57
+ run()
58
+
59
+ @received_records = received_records
60
+ end
61
+
62
+ it "All records should be instances of MyModel" do
63
+ @received_records.all? { |record| record.is_a?(MyModel) }.should be_true
64
+ end
65
+
66
+ it "All records should be associated to the same ScrapingSession object" do
67
+ @received_records.all? { |record| record.session.should be_eql @scraper.session }
68
+ end
69
+
70
+ it "Should auto assign a session title" do
71
+ @scraper.session.title.should match /^(\d)*\s(MyModel)\sDataset$/
35
72
  end
36
73
  end
74
+ end
37
75
 
38
- context "with title argument and no block" do
39
- before do
40
- @scraper.set_storage "my dummy dataset"
76
+ context "with a constant name and no block" do
77
+ before do
78
+ received_records = nil
79
+
80
+ if !Object.const_defined? :MyModel
81
+ class MyModel < ExtraLoop::Storage::Record
82
+ attribute :foo
83
+ end
41
84
  end
85
+
86
+ @scraper.
87
+ set_storage(MyModel).
88
+ run()
89
+
90
+ @received_records = received_records
91
+ end
92
+
93
+ it "should persist 10 records" do
94
+ (@scraper.session.records MyModel).should have(10).records
95
+ (@scraper.session.records MyModel).map(&:id).reject(&:nil?).should_not be_empty
42
96
  end
43
97
  end
44
98
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: extraloop-redis-storage
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-02-20 00:00:00.000000000Z
12
+ date: 2012-02-21 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: extraloop
16
- requirement: &16470780 !ruby/object:Gem::Requirement
16
+ requirement: &19118640 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 0.0.3
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *16470780
24
+ version_requirements: *19118640
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: ohm
27
- requirement: &16470320 !ruby/object:Gem::Requirement
27
+ requirement: &19118180 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 0.1.3
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *16470320
35
+ version_requirements: *19118180
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: ohm-contrib
38
- requirement: &16469860 !ruby/object:Gem::Requirement
38
+ requirement: &19117720 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 0.1.2
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *16469860
46
+ version_requirements: *19117720
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: rspec
49
- requirement: &16469400 !ruby/object:Gem::Requirement
49
+ requirement: &19117260 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: 2.7.0
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *16469400
57
+ version_requirements: *19117260
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rr
60
- requirement: &16468940 !ruby/object:Gem::Requirement
60
+ requirement: &19116800 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ~>
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: 1.0.4
66
66
  type: :development
67
67
  prerelease: false
68
- version_requirements: *16468940
68
+ version_requirements: *19116800
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: pry
71
- requirement: &16468480 !ruby/object:Gem::Requirement
71
+ requirement: &19116340 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ~>
@@ -76,7 +76,7 @@ dependencies:
76
76
  version: 0.9.7.4
77
77
  type: :development
78
78
  prerelease: false
79
- version_requirements: *16468480
79
+ version_requirements: *19116340
80
80
  description: Redis+Ohm based storage for data sets extracted using the ExtraLoop toolkit.
81
81
  email: andrea.giulio.fiore@googlemail.com
82
82
  executables: []
@@ -86,6 +86,7 @@ files:
86
86
  - History.txt
87
87
  - README.rdoc
88
88
  - examples/amazon_reviews.rb
89
+ - examples/google_news_scraper.rb
89
90
  - examples/lib/amazon_review_scraper.rb
90
91
  - lib/extraloop/redis-storage.rb
91
92
  - lib/extraloop/redis-storage/dataset_factory.rb