extraloop-redis-storage 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -4,7 +4,6 @@ require "date"
4
4
  require "./lib/amazon_review_scraper.rb"
5
5
  require "../lib/extraloop/redis-storage.rb"
6
6
 
7
-
8
7
  class AmazonReview < ExtraLoop::Storage::Record
9
8
  attribute :title
10
9
  attribute :rank
@@ -0,0 +1,17 @@
1
+ require "pry"
2
+ require "rubygems"
3
+ require "../lib/extraloop/redis-storage.rb"
4
+
5
+
6
+ google_news_scraper = ExtraLoop::IterativeScraper.new("https://www.google.com/search?tbm=nws&q=Egypt").
7
+ set_iteration(:start, (1..101).step(10)).
8
+ loop_on("h3") { |nodes| nodes.map(&:parent) }.
9
+ extract(:title, "h3.r a").
10
+ extract(:url, "h3.r a", :href).
11
+ extract(:source, "br") { |node| node.next.text.split("-").first }.
12
+ set_storage(:GoogleNewsStory).
13
+ run
14
+
15
+ puts "#{GoogleNewsStory.all.size} news stories fetched..."
16
+
17
+
@@ -3,8 +3,10 @@
3
3
 
4
4
  class ExtraLoop::Storage::DatasetFactory
5
5
  def initialize(classname, attributes=[])
6
- @classname = classname.to_s.capitalize
7
6
 
7
+ @classname = (classname.to_s.split "").each_with_index.map { |char, index| index == 0 && char.upcase or char }.join
8
+
9
+ return if Object.const_defined? @classname
8
10
  Object.const_set(@classname, Class.new(ExtraLoop::Storage::Record) {
9
11
  attributes.each { |attr| attribute attr }
10
12
  })
@@ -1,21 +1,14 @@
1
1
  class ExtraLoop::ScraperBase
2
2
  attr_reader :session
3
3
 
4
- def set_storage(*args)
5
- model = args.detect { |arg| arg.is_a?(Symbol) or arg.respond_to?(:new) }
6
- title = args.detect { |arg| arg.is_a?(String) }
7
-
8
- collection_name = self.class.to_s.gsub(/(:)+/,'_').downcase + "_data"
9
-
4
+ def set_storage(model, title=nil)
5
+ collection_name = "#{Time.now.to_i} #{model.to_s} Dataset"
10
6
  title ||= collection_name
11
- model ||= collection_name.to_sym
12
-
13
7
  log_session! title
14
8
 
15
9
  @model = model_klass = model.respond_to?(:new) && model || ExtraLoop::Storage::DatasetFactory.new(model.to_sym, @extractor_args.map(&:first)).get_class
16
10
 
17
11
  on :data do |results|
18
- # TODO: avoid calling send in the scraper object
19
12
  results = results.map { |result| @scraper.send(:instanciate_model, result) }
20
13
  block_given? && yield(results) || results.each { |result| result.save if result.respond_to?(:save) }
21
14
  end
@@ -20,25 +20,79 @@ describe ExtraLoop::ScraperBase do
20
20
  end
21
21
 
22
22
  describe "#set_storage" do
23
- context "with no arguments but a block" do
23
+ context "with a symbol as the 'model' parameter value" do
24
24
  before do
25
25
  received_records = nil
26
26
 
27
27
  @scraper.
28
- set_storage { |records| received_records = records }.
28
+ set_storage(:MyRecord) { |records| received_records = records }.
29
29
  run()
30
30
 
31
31
  @received_records = received_records
32
32
  end
33
- it "all records should be openstruct instances" do
34
- @received_records.all? { |record| record.is_a?(Extraloop_scraperbase_data) }.should be_true
33
+
34
+ it "Should dynamically create 'MyRecord' class" do
35
+ @received_records.all? { |record| record.is_a?(MyRecord) }.should be_true
36
+ end
37
+
38
+ it "All records should be associated to the same ScrapingSession object" do
39
+ @received_records.all? { |record| record.session.should be_eql @scraper.session }
40
+ end
41
+
42
+ it "Should auto assign a session title" do
43
+ @scraper.session.title.should match /^(\d)*\s(MyRecord)\sDataset$/
44
+ end
45
+ end
46
+
47
+ context "with a constant as the 'model' parameter value" do
48
+ before do
49
+ received_records = nil
50
+
51
+ class MyModel < ExtraLoop::Storage::Record
52
+ attribute :foo
53
+ end
54
+
55
+ @scraper.
56
+ set_storage(MyModel) { |records| received_records = records }.
57
+ run()
58
+
59
+ @received_records = received_records
60
+ end
61
+
62
+ it "All records should be instances of MyModel" do
63
+ @received_records.all? { |record| record.is_a?(MyModel) }.should be_true
64
+ end
65
+
66
+ it "All records should be associated to the same ScrapingSession object" do
67
+ @received_records.all? { |record| record.session.should be_eql @scraper.session }
68
+ end
69
+
70
+ it "Should auto assign a session title" do
71
+ @scraper.session.title.should match /^(\d)*\s(MyModel)\sDataset$/
35
72
  end
36
73
  end
74
+ end
37
75
 
38
- context "with title argument and no block" do
39
- before do
40
- @scraper.set_storage "my dummy dataset"
76
+ context "with a constant name and no block" do
77
+ before do
78
+ received_records = nil
79
+
80
+ if !Object.const_defined? :MyModel
81
+ class MyModel < ExtraLoop::Storage::Record
82
+ attribute :foo
83
+ end
41
84
  end
85
+
86
+ @scraper.
87
+ set_storage(MyModel).
88
+ run()
89
+
90
+ @received_records = received_records
91
+ end
92
+
93
+ it "should persist 10 records" do
94
+ (@scraper.session.records MyModel).should have(10).records
95
+ (@scraper.session.records MyModel).map(&:id).reject(&:nil?).should_not be_empty
42
96
  end
43
97
  end
44
98
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: extraloop-redis-storage
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-02-20 00:00:00.000000000Z
12
+ date: 2012-02-21 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: extraloop
16
- requirement: &16470780 !ruby/object:Gem::Requirement
16
+ requirement: &19118640 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 0.0.3
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *16470780
24
+ version_requirements: *19118640
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: ohm
27
- requirement: &16470320 !ruby/object:Gem::Requirement
27
+ requirement: &19118180 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 0.1.3
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *16470320
35
+ version_requirements: *19118180
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: ohm-contrib
38
- requirement: &16469860 !ruby/object:Gem::Requirement
38
+ requirement: &19117720 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 0.1.2
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *16469860
46
+ version_requirements: *19117720
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: rspec
49
- requirement: &16469400 !ruby/object:Gem::Requirement
49
+ requirement: &19117260 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: 2.7.0
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *16469400
57
+ version_requirements: *19117260
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rr
60
- requirement: &16468940 !ruby/object:Gem::Requirement
60
+ requirement: &19116800 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ~>
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: 1.0.4
66
66
  type: :development
67
67
  prerelease: false
68
- version_requirements: *16468940
68
+ version_requirements: *19116800
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: pry
71
- requirement: &16468480 !ruby/object:Gem::Requirement
71
+ requirement: &19116340 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ~>
@@ -76,7 +76,7 @@ dependencies:
76
76
  version: 0.9.7.4
77
77
  type: :development
78
78
  prerelease: false
79
- version_requirements: *16468480
79
+ version_requirements: *19116340
80
80
  description: Redis+Ohm based storage for data sets extracted using the ExtraLoop toolkit.
81
81
  email: andrea.giulio.fiore@googlemail.com
82
82
  executables: []
@@ -86,6 +86,7 @@ files:
86
86
  - History.txt
87
87
  - README.rdoc
88
88
  - examples/amazon_reviews.rb
89
+ - examples/google_news_scraper.rb
89
90
  - examples/lib/amazon_review_scraper.rb
90
91
  - lib/extraloop/redis-storage.rb
91
92
  - lib/extraloop/redis-storage/dataset_factory.rb