extraloop-redis-storage 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/examples/amazon_reviews.rb
CHANGED
@@ -0,0 +1,17 @@
|
|
1
|
+
require "pry"
|
2
|
+
require "rubygems"
|
3
|
+
require "../lib/extraloop/redis-storage.rb"
|
4
|
+
|
5
|
+
|
6
|
+
google_news_scraper = ExtraLoop::IterativeScraper.new("https://www.google.com/search?tbm=nws&q=Egypt").
|
7
|
+
set_iteration(:start, (1..101).step(10)).
|
8
|
+
loop_on("h3") { |nodes| nodes.map(&:parent) }.
|
9
|
+
extract(:title, "h3.r a").
|
10
|
+
extract(:url, "h3.r a", :href).
|
11
|
+
extract(:source, "br") { |node| node.next.text.split("-").first }.
|
12
|
+
set_storage(:GoogleNewsStory).
|
13
|
+
run
|
14
|
+
|
15
|
+
puts "#{GoogleNewsStory.all.size} news stories fetched..."
|
16
|
+
|
17
|
+
|
@@ -3,8 +3,10 @@
|
|
3
3
|
|
4
4
|
class ExtraLoop::Storage::DatasetFactory
|
5
5
|
def initialize(classname, attributes=[])
|
6
|
-
@classname = classname.to_s.capitalize
|
7
6
|
|
7
|
+
@classname = (classname.to_s.split "").each_with_index.map { |char, index| index == 0 && char.upcase or char }.join
|
8
|
+
|
9
|
+
return if Object.const_defined? @classname
|
8
10
|
Object.const_set(@classname, Class.new(ExtraLoop::Storage::Record) {
|
9
11
|
attributes.each { |attr| attribute attr }
|
10
12
|
})
|
@@ -1,21 +1,14 @@
|
|
1
1
|
class ExtraLoop::ScraperBase
|
2
2
|
attr_reader :session
|
3
3
|
|
4
|
-
def set_storage(*args)
|
5
|
-
|
6
|
-
title = args.detect { |arg| arg.is_a?(String) }
|
7
|
-
|
8
|
-
collection_name = self.class.to_s.gsub(/(:)+/,'_').downcase + "_data"
|
9
|
-
|
4
|
+
def set_storage(model, title=nil)
|
5
|
+
collection_name = "#{Time.now.to_i} #{model.to_s} Dataset"
|
10
6
|
title ||= collection_name
|
11
|
-
model ||= collection_name.to_sym
|
12
|
-
|
13
7
|
log_session! title
|
14
8
|
|
15
9
|
@model = model_klass = model.respond_to?(:new) && model || ExtraLoop::Storage::DatasetFactory.new(model.to_sym, @extractor_args.map(&:first)).get_class
|
16
10
|
|
17
11
|
on :data do |results|
|
18
|
-
# TODO: avoid calling send in the scraper object
|
19
12
|
results = results.map { |result| @scraper.send(:instanciate_model, result) }
|
20
13
|
block_given? && yield(results) || results.each { |result| result.save if result.respond_to?(:save) }
|
21
14
|
end
|
data/spec/scraper_base_spec.rb
CHANGED
@@ -20,25 +20,79 @@ describe ExtraLoop::ScraperBase do
|
|
20
20
|
end
|
21
21
|
|
22
22
|
describe "#set_storage" do
|
23
|
-
context "with
|
23
|
+
context "with a symbol as the 'model' parameter value" do
|
24
24
|
before do
|
25
25
|
received_records = nil
|
26
26
|
|
27
27
|
@scraper.
|
28
|
-
set_storage { |records| received_records = records }.
|
28
|
+
set_storage(:MyRecord) { |records| received_records = records }.
|
29
29
|
run()
|
30
30
|
|
31
31
|
@received_records = received_records
|
32
32
|
end
|
33
|
-
|
34
|
-
|
33
|
+
|
34
|
+
it "Should dynamically create 'MyRecord' class" do
|
35
|
+
@received_records.all? { |record| record.is_a?(MyRecord) }.should be_true
|
36
|
+
end
|
37
|
+
|
38
|
+
it "All records should be associated to the same ScrapingSession object" do
|
39
|
+
@received_records.all? { |record| record.session.should be_eql @scraper.session }
|
40
|
+
end
|
41
|
+
|
42
|
+
it "Should auto assign a session title" do
|
43
|
+
@scraper.session.title.should match /^(\d)*\s(MyRecord)\sDataset$/
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
context "with a constant as the 'model' parameter value" do
|
48
|
+
before do
|
49
|
+
received_records = nil
|
50
|
+
|
51
|
+
class MyModel < ExtraLoop::Storage::Record
|
52
|
+
attribute :foo
|
53
|
+
end
|
54
|
+
|
55
|
+
@scraper.
|
56
|
+
set_storage(MyModel) { |records| received_records = records }.
|
57
|
+
run()
|
58
|
+
|
59
|
+
@received_records = received_records
|
60
|
+
end
|
61
|
+
|
62
|
+
it "All records should be instances of MyModel" do
|
63
|
+
@received_records.all? { |record| record.is_a?(MyModel) }.should be_true
|
64
|
+
end
|
65
|
+
|
66
|
+
it "All records should be associated to the same ScrapingSession object" do
|
67
|
+
@received_records.all? { |record| record.session.should be_eql @scraper.session }
|
68
|
+
end
|
69
|
+
|
70
|
+
it "Should auto assign a session title" do
|
71
|
+
@scraper.session.title.should match /^(\d)*\s(MyModel)\sDataset$/
|
35
72
|
end
|
36
73
|
end
|
74
|
+
end
|
37
75
|
|
38
|
-
|
39
|
-
|
40
|
-
|
76
|
+
context "with a constant name and no block" do
|
77
|
+
before do
|
78
|
+
received_records = nil
|
79
|
+
|
80
|
+
if !Object.const_defined? :MyModel
|
81
|
+
class MyModel < ExtraLoop::Storage::Record
|
82
|
+
attribute :foo
|
83
|
+
end
|
41
84
|
end
|
85
|
+
|
86
|
+
@scraper.
|
87
|
+
set_storage(MyModel).
|
88
|
+
run()
|
89
|
+
|
90
|
+
@received_records = received_records
|
91
|
+
end
|
92
|
+
|
93
|
+
it "should persist 10 records" do
|
94
|
+
(@scraper.session.records MyModel).should have(10).records
|
95
|
+
(@scraper.session.records MyModel).map(&:id).reject(&:nil?).should_not be_empty
|
42
96
|
end
|
43
97
|
end
|
44
98
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: extraloop-redis-storage
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-02-
|
12
|
+
date: 2012-02-21 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: extraloop
|
16
|
-
requirement: &
|
16
|
+
requirement: &19118640 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 0.0.3
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *19118640
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: ohm
|
27
|
-
requirement: &
|
27
|
+
requirement: &19118180 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 0.1.3
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *19118180
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: ohm-contrib
|
38
|
-
requirement: &
|
38
|
+
requirement: &19117720 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 0.1.2
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *19117720
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: rspec
|
49
|
-
requirement: &
|
49
|
+
requirement: &19117260 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: 2.7.0
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *19117260
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rr
|
60
|
-
requirement: &
|
60
|
+
requirement: &19116800 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: 1.0.4
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *19116800
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: pry
|
71
|
-
requirement: &
|
71
|
+
requirement: &19116340 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ~>
|
@@ -76,7 +76,7 @@ dependencies:
|
|
76
76
|
version: 0.9.7.4
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *19116340
|
80
80
|
description: Redis+Ohm based storage for data sets extracted using the ExtraLoop toolkit.
|
81
81
|
email: andrea.giulio.fiore@googlemail.com
|
82
82
|
executables: []
|
@@ -86,6 +86,7 @@ files:
|
|
86
86
|
- History.txt
|
87
87
|
- README.rdoc
|
88
88
|
- examples/amazon_reviews.rb
|
89
|
+
- examples/google_news_scraper.rb
|
89
90
|
- examples/lib/amazon_review_scraper.rb
|
90
91
|
- lib/extraloop/redis-storage.rb
|
91
92
|
- lib/extraloop/redis-storage/dataset_factory.rb
|