extraloop-redis-storage 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +7 -11
- data/examples/amazon_reviews.rb +3 -12
- data/examples/google_news_scraper.rb +1 -1
- data/examples/lib/models/amazon_review.rb +9 -0
- data/examples/lib/{amazon_review_scraper.rb → scrapers/amazon_review_scraper.rb} +0 -0
- data/lib/extraloop/redis-storage.rb +10 -7
- data/lib/extraloop/redis-storage/dataset_factory.rb +2 -1
- data/lib/extraloop/redis-storage/model.rb +13 -0
- data/lib/extraloop/redis-storage/scraping_session.rb +42 -3
- data/lib/extraloop/scraper_base.rb +7 -2
- metadata +28 -15
data/README.rdoc
CHANGED
@@ -12,18 +12,18 @@ makes storing and managing Ruby objects in Redis easy and convenient.
|
|
12
12
|
|
13
13
|
== Usage
|
14
14
|
|
15
|
-
Extraloop's Redis storage module decorates
|
15
|
+
Extraloop's Redis storage module decorates <code>ExtraLoop::ScraperBase</code> and <code>ExtraLoop::IterativeScraper</code> instances
|
16
16
|
with the +set_storage+ method: a helper method that lets you specify how the scraped data should be stored.
|
17
17
|
|
18
18
|
require "extraloop/redis-storage"
|
19
19
|
|
20
20
|
class AmazonReview < ExtraLoop::Storage::Record
|
21
21
|
attribute :title
|
22
|
-
attribute :
|
23
|
-
attribute :
|
22
|
+
attribute :rank
|
23
|
+
attribute :date
|
24
24
|
|
25
25
|
def validate
|
26
|
-
(0..5).include
|
26
|
+
assert (0..5).include?(rank.to_i), "Rank not in range"
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
@@ -32,7 +32,7 @@ with the +set_storage+ method: a helper method that allows to specify how the sc
|
|
32
32
|
.run()
|
33
33
|
|
34
34
|
At each scraper run, the ExtraLoop storage module internally instantiates a
|
35
|
-
session (see
|
35
|
+
session (see <code>ExtraLoop::Storage::ScrapingSession</code>) and links the extracted records to it.
|
36
36
|
The +AmazonReview+ instances extracted and stored in the example above can in fact be fetched by calling
|
37
37
|
Ohm's +find+ with the session id as argument.
|
38
38
|
|
@@ -48,10 +48,6 @@ session instance:
|
|
48
48
|
|
49
49
|
The +set_storage+ method can be called with the following arguments:
|
50
50
|
|
51
|
-
* _model_ A Ruby constant specifying the model to be used for storing the extracted data
|
52
|
-
* _session_title_ A human readable
|
53
|
-
|
54
|
-
== Running the test suite
|
55
|
-
|
56
|
-
The test suite can be run by executing the +rspec\ \*+ command from within the +spec/+ directory.
|
51
|
+
* _model_ A Ruby constant specifying the model to be used for storing the extracted data.
|
52
|
+
* _session_title_ A human readable title for the extracted dataset (optional).
|
57
53
|
|
data/examples/amazon_reviews.rb
CHANGED
@@ -1,18 +1,9 @@
|
|
1
1
|
require "rubygems"
|
2
|
-
require "extraloop"
|
3
2
|
require "date"
|
4
|
-
require "
|
3
|
+
require "extraloop"
|
5
4
|
require "../lib/extraloop/redis-storage.rb"
|
6
|
-
|
7
|
-
|
8
|
-
attribute :title
|
9
|
-
attribute :rank
|
10
|
-
attribute :date
|
11
|
-
|
12
|
-
def validate
|
13
|
-
assert (0..5).include?(rank.to_i), "Rank not in range"
|
14
|
-
end
|
15
|
-
end
|
5
|
+
require "./lib/models/amazon_review.rb"
|
6
|
+
require "./lib/scrapers/amazon_review_scraper.rb"
|
16
7
|
|
17
8
|
scraper = AmazonReviewScraper.new("0262560992").
|
18
9
|
set_storage(AmazonReview).
|
File without changes
|
@@ -1,30 +1,33 @@
|
|
1
1
|
require "json"
|
2
2
|
require "rubygems"
|
3
3
|
require "redis"
|
4
|
-
require 'pry'
|
5
4
|
require "ohm"
|
6
5
|
require "ohm/contrib"
|
7
6
|
require "extraloop"
|
8
7
|
|
9
8
|
base_path = File.realpath(File.dirname(__FILE__))
|
10
9
|
$: << "#{base_path}"
|
11
|
-
require "scraper_base"
|
12
|
-
|
13
10
|
|
11
|
+
require "scraper_base"
|
14
12
|
|
15
13
|
module ExtraLoop
|
16
14
|
module Storage
|
17
15
|
VERSION ||= "0.0.1"
|
18
16
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
17
|
+
def self.connect(*args)
|
18
|
+
Ohm.connect(*args)
|
19
|
+
end
|
20
|
+
|
21
|
+
# Tries to automatically locate the models directory and load all Ruby files within it
|
22
|
+
def self.autoload_models(dirname='models')
|
23
|
+
Dir["**/**#{dirname}/*.rb"].each { |path| require "./#{path}" }
|
23
24
|
end
|
24
25
|
end
|
25
26
|
end
|
26
27
|
|
27
28
|
ExtraLoop::Storage.autoload :Record, "#{base_path}/redis-storage/record.rb"
|
28
29
|
ExtraLoop::Storage.autoload :ScrapingSession, "#{base_path}/redis-storage/scraping_session.rb"
|
30
|
+
ExtraLoop::Storage.autoload :Model, "#{base_path}/redis-storage/model.rb"
|
29
31
|
ExtraLoop::Storage.autoload :DatasetFactory, "#{base_path}/redis-storage/dataset_factory.rb"
|
30
32
|
|
33
|
+
|
@@ -6,7 +6,8 @@ class ExtraLoop::Storage::DatasetFactory
|
|
6
6
|
|
7
7
|
@classname = (classname.to_s.split "").each_with_index.map { |char, index| index == 0 && char.upcase or char }.join
|
8
8
|
|
9
|
-
return if Object.const_defined? @classname
|
9
|
+
return Object.const_get @classname if Object.const_defined? @classname
|
10
|
+
|
10
11
|
Object.const_set(@classname, Class.new(ExtraLoop::Storage::Record) {
|
11
12
|
attributes.each { |attr| attribute attr }
|
12
13
|
})
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# Meta model designed to keep track of what types of records
|
2
|
+
# are associated to a Scraping session object.
|
3
|
+
#
|
4
|
+
class ExtraLoop::Storage::Model < Ohm::Model
|
5
|
+
attribute :name
|
6
|
+
index :name
|
7
|
+
|
8
|
+
def to_hash
|
9
|
+
super.merge(attributes.reduce({}) { |memo, attribute|
|
10
|
+
memo.merge(attribute => send(attribute))
|
11
|
+
})
|
12
|
+
end
|
13
|
+
end
|
@@ -2,11 +2,50 @@ class ExtraLoop::Storage::ScrapingSession < Ohm::Model
|
|
2
2
|
|
3
3
|
include Ohm::Boundaries
|
4
4
|
include Ohm::Timestamping
|
5
|
+
include Ohm::Callbacks
|
5
6
|
|
6
7
|
attribute :title
|
8
|
+
reference :model, ExtraLoop::Storage::Model
|
7
9
|
|
8
|
-
def records(
|
9
|
-
|
10
|
-
|
10
|
+
def records(params={})
|
11
|
+
klass = if Object.const_defined?(model.name)
|
12
|
+
Object.const_get(model.name)
|
13
|
+
else
|
14
|
+
dynamic_class = Class.new(ExtraLoop::Storage::Record) do
|
15
|
+
# override default to_hash so that it will return the Redis hash
|
16
|
+
# internally stored by Ohm
|
17
|
+
def to_hash
|
18
|
+
Ohm.redis.hgetall self.key
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
Object.const_set(model.name, dynamic_class)
|
23
|
+
dynamic_class
|
24
|
+
end
|
25
|
+
|
26
|
+
# set a session index, so that Ohm finder will work
|
27
|
+
klass.indices << :session_id unless klass.indices.include? :session_id
|
28
|
+
|
29
|
+
klass.find({
|
30
|
+
:session_id => self.id
|
31
|
+
}.merge(params))
|
32
|
+
end
|
33
|
+
|
34
|
+
def validate
|
35
|
+
assert_present :model
|
36
|
+
end
|
37
|
+
|
38
|
+
def to_hash
|
39
|
+
attrs = attributes.reduce({}) { |memo, attribute|
|
40
|
+
memo.merge(attribute => send(attribute))
|
41
|
+
}.merge({
|
42
|
+
:records => records.map(&:to_hash),
|
43
|
+
:model => model.to_hash
|
44
|
+
})
|
45
|
+
|
46
|
+
super.merge attrs
|
47
|
+
end
|
48
|
+
|
49
|
+
def to_csv
|
11
50
|
end
|
12
51
|
end
|
@@ -4,9 +4,9 @@ class ExtraLoop::ScraperBase
|
|
4
4
|
def set_storage(model, title=nil)
|
5
5
|
collection_name = "#{Time.now.to_i} #{model.to_s} Dataset"
|
6
6
|
title ||= collection_name
|
7
|
-
log_session! title
|
8
7
|
|
9
8
|
@model = model_klass = model.respond_to?(:new) && model || ExtraLoop::Storage::DatasetFactory.new(model.to_sym, @extractor_args.map(&:first)).get_class
|
9
|
+
log_session! title
|
10
10
|
|
11
11
|
on :data do |results|
|
12
12
|
results = results.map { |result| @scraper.send(:instanciate_model, result) }
|
@@ -17,7 +17,12 @@ class ExtraLoop::ScraperBase
|
|
17
17
|
protected
|
18
18
|
# Creates a scraping session
|
19
19
|
def log_session!(title="")
|
20
|
-
|
20
|
+
if !@session
|
21
|
+
ns = ExtraLoop::Storage
|
22
|
+
results = ns::Model.find :name => @model
|
23
|
+
model = results.any? && results.first || ns::Model.create(:name => @model)
|
24
|
+
@session = ns::ScrapingSession.create :title => title, :model => model
|
25
|
+
end
|
21
26
|
end
|
22
27
|
|
23
28
|
# Converts extracted records into instances of the dataset model specified as the first argument
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: extraloop-redis-storage
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-02-
|
12
|
+
date: 2012-02-26 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: extraloop
|
16
|
-
requirement: &
|
16
|
+
requirement: &18201600 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 0.0.3
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *18201600
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: ohm
|
27
|
-
requirement: &
|
27
|
+
requirement: &18201140 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 0.1.3
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *18201140
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: ohm-contrib
|
38
|
-
requirement: &
|
38
|
+
requirement: &18200680 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,21 @@ dependencies:
|
|
43
43
|
version: 0.1.2
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *18200680
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: thor
|
49
|
+
requirement: &18200080 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - =
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 0.14.6
|
55
|
+
type: :runtime
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *18200080
|
47
58
|
- !ruby/object:Gem::Dependency
|
48
59
|
name: rspec
|
49
|
-
requirement: &
|
60
|
+
requirement: &18199420 !ruby/object:Gem::Requirement
|
50
61
|
none: false
|
51
62
|
requirements:
|
52
63
|
- - ~>
|
@@ -54,10 +65,10 @@ dependencies:
|
|
54
65
|
version: 2.7.0
|
55
66
|
type: :development
|
56
67
|
prerelease: false
|
57
|
-
version_requirements: *
|
68
|
+
version_requirements: *18199420
|
58
69
|
- !ruby/object:Gem::Dependency
|
59
70
|
name: rr
|
60
|
-
requirement: &
|
71
|
+
requirement: &18198820 !ruby/object:Gem::Requirement
|
61
72
|
none: false
|
62
73
|
requirements:
|
63
74
|
- - ~>
|
@@ -65,10 +76,10 @@ dependencies:
|
|
65
76
|
version: 1.0.4
|
66
77
|
type: :development
|
67
78
|
prerelease: false
|
68
|
-
version_requirements: *
|
79
|
+
version_requirements: *18198820
|
69
80
|
- !ruby/object:Gem::Dependency
|
70
81
|
name: pry
|
71
|
-
requirement: &
|
82
|
+
requirement: &18187500 !ruby/object:Gem::Requirement
|
72
83
|
none: false
|
73
84
|
requirements:
|
74
85
|
- - ~>
|
@@ -76,7 +87,7 @@ dependencies:
|
|
76
87
|
version: 0.9.7.4
|
77
88
|
type: :development
|
78
89
|
prerelease: false
|
79
|
-
version_requirements: *
|
90
|
+
version_requirements: *18187500
|
80
91
|
description: Redis+Ohm based storage for data sets extracted using the ExtraLoop toolkit.
|
81
92
|
email: andrea.giulio.fiore@googlemail.com
|
82
93
|
executables: []
|
@@ -87,9 +98,11 @@ files:
|
|
87
98
|
- README.rdoc
|
88
99
|
- examples/amazon_reviews.rb
|
89
100
|
- examples/google_news_scraper.rb
|
90
|
-
- examples/lib/
|
101
|
+
- examples/lib/models/amazon_review.rb
|
102
|
+
- examples/lib/scrapers/amazon_review_scraper.rb
|
91
103
|
- lib/extraloop/redis-storage.rb
|
92
104
|
- lib/extraloop/redis-storage/dataset_factory.rb
|
105
|
+
- lib/extraloop/redis-storage/model.rb
|
93
106
|
- lib/extraloop/redis-storage/record.rb
|
94
107
|
- lib/extraloop/redis-storage/scraping_session.rb
|
95
108
|
- lib/extraloop/scraper_base.rb
|