extraloop-redis-storage 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -12,18 +12,18 @@ makes storing and managing Ruby objects in Redis easy and convenient.
12
12
 
13
13
  == Usage
14
14
 
15
- Extraloop's Redis storage module decorates +ExtraLoop::ScraperBase+ and +ExtraLoop::IterativeScraper+ instances
15
+ Extraloop's Redis storage module decorates <code>ExtraLoop::ScraperBase</code> and <code>ExtraLoop::IterativeScraper</code> instances
16
16
  with the +set_storage+ method: a helper method that allows you to specify how the scraped data should be stored.
17
17
 
18
18
  require "extraloop/redis-storage"
19
19
 
20
20
  class AmazonReview < ExtraLoop::Storage::Record
21
21
  attribute :title
22
- attribute :author
23
- attribute :star
22
+ attribute :rank
23
+ attribute :date
24
24
 
25
25
  def validate
26
- (0..5).include star.to_i or && false
26
+ assert (0..5).include?(rank.to_i), "Rank not in range"
27
27
  end
28
28
  end
29
29
 
@@ -32,7 +32,7 @@ with the +set_storage+ method: a helper method that allows to specify how the sc
32
32
  .run()
33
33
 
34
34
  At each scraper run, the ExtraLoop storage module internally instantiates a
35
- session (see +ExtraLoop::Storage::ScrapingSession+) and link the extracted records to it.
35
+ session (see <code>ExtraLoop::Storage::ScrapingSession</code>) and links the extracted records to it.
36
36
  The +AmazonReview+ instances extracted and stored in the example above can in fact be fetched by calling
37
37
  Ohm's +find+ with the session id as argument.
38
38
 
@@ -48,10 +48,6 @@ session instance:
48
48
 
49
49
  The +set_storage+ method can be called with the following arguments:
50
50
 
51
- * _model_ A Ruby constant specifying the model to be used for storing the extracted data (optional; when this argument is not provided, a simple-non validating model will be generated on the fly).
52
- * _session_title_ A human readable name for the extracted dataset.
53
-
54
- == Running the test suite
55
-
56
- The test suite can be run by executing the +rspec\ \*+ command from within the +spec/+ directory.
51
+ * _model_ A Ruby constant specifying the model to be used for storing the extracted data.
52
+ * _session_title_ A human readable title for the extracted dataset (optional).
57
53
 
@@ -1,18 +1,9 @@
1
1
  require "rubygems"
2
- require "extraloop"
3
2
  require "date"
4
- require "./lib/amazon_review_scraper.rb"
3
+ require "extraloop"
5
4
  require "../lib/extraloop/redis-storage.rb"
6
-
7
- class AmazonReview < ExtraLoop::Storage::Record
8
- attribute :title
9
- attribute :rank
10
- attribute :date
11
-
12
- def validate
13
- assert (0..5).include?(rank.to_i), "Rank not in range"
14
- end
15
- end
5
+ require "./lib/models/amazon_review.rb"
6
+ require "./lib/scrapers/amazon_review_scraper.rb"
16
7
 
17
8
  scraper = AmazonReviewScraper.new("0262560992").
18
9
  set_storage(AmazonReview).
@@ -12,6 +12,6 @@ google_news_scraper = ExtraLoop::IterativeScraper.new("https://www.google.com/se
12
12
  set_storage(:GoogleNewsStory).
13
13
  run
14
14
 
15
- puts "#{GoogleNewsStory.all.size} news stories fetched..."
15
+ puts "#{GoogleNewsStory.all.to_a.size} news stories fetched..."
16
16
 
17
17
 
@@ -0,0 +1,9 @@
1
+ class AmazonReview < ExtraLoop::Storage::Record
2
+ attribute :title
3
+ attribute :rank
4
+ attribute :date
5
+
6
+ def validate
7
+ assert (0..5).include?(rank.to_i), "Rank not in range"
8
+ end
9
+ end
@@ -1,30 +1,33 @@
1
1
  require "json"
2
2
  require "rubygems"
3
3
  require "redis"
4
- require 'pry'
5
4
  require "ohm"
6
5
  require "ohm/contrib"
7
6
  require "extraloop"
8
7
 
9
8
  base_path = File.realpath(File.dirname(__FILE__))
10
9
  $: << "#{base_path}"
11
- require "scraper_base"
12
-
13
10
 
11
+ require "scraper_base"
14
12
 
15
13
  module ExtraLoop
16
14
  module Storage
17
15
  VERSION ||= "0.0.1"
18
16
 
19
- class << self
20
- def connect(*args)
21
- Ohm.connect(*args)
22
- end
17
+ def self.connect(*args)
18
+ Ohm.connect(*args)
19
+ end
20
+
21
+ # Tries to automatically locate the models directory and load all Ruby files within it
22
+ def self.autoload_models(dirname='models')
23
+ Dir["**/**#{dirname}/*.rb"].each { |path| require "./#{path}" }
23
24
  end
24
25
  end
25
26
  end
26
27
 
27
28
  ExtraLoop::Storage.autoload :Record, "#{base_path}/redis-storage/record.rb"
28
29
  ExtraLoop::Storage.autoload :ScrapingSession, "#{base_path}/redis-storage/scraping_session.rb"
30
+ ExtraLoop::Storage.autoload :Model, "#{base_path}/redis-storage/model.rb"
29
31
  ExtraLoop::Storage.autoload :DatasetFactory, "#{base_path}/redis-storage/dataset_factory.rb"
30
32
 
33
+
@@ -6,7 +6,8 @@ class ExtraLoop::Storage::DatasetFactory
6
6
 
7
7
  @classname = (classname.to_s.split "").each_with_index.map { |char, index| index == 0 && char.upcase or char }.join
8
8
 
9
- return if Object.const_defined? @classname
9
+ return Object.const_get @classname if Object.const_defined? @classname
10
+
10
11
  Object.const_set(@classname, Class.new(ExtraLoop::Storage::Record) {
11
12
  attributes.each { |attr| attribute attr }
12
13
  })
@@ -0,0 +1,13 @@
1
+ # Meta model designed to keep track of what types of records
2
+ # are associated with a scraping session object.
3
+ #
4
+ class ExtraLoop::Storage::Model < Ohm::Model
5
+ attribute :name
6
+ index :name
7
+
8
+ def to_hash
9
+ super.merge(attributes.reduce({}) { |memo, attribute|
10
+ memo.merge(attribute => send(attribute))
11
+ })
12
+ end
13
+ end
@@ -2,11 +2,50 @@ class ExtraLoop::Storage::ScrapingSession < Ohm::Model
2
2
 
3
3
  include Ohm::Boundaries
4
4
  include Ohm::Timestamping
5
+ include Ohm::Callbacks
5
6
 
6
7
  attribute :title
8
+ reference :model, ExtraLoop::Storage::Model
7
9
 
8
- def records(collection)
9
- Kernel.const_get(collection.to_s).
10
- find(:session_id => self.id)
10
+ def records(params={})
11
+ klass = if Object.const_defined?(model.name)
12
+ Object.const_get(model.name)
13
+ else
14
+ dynamic_class = Class.new(ExtraLoop::Storage::Record) do
15
+ # override default to_hash so that it will return the Redis hash
16
+ # internally stored by Ohm
17
+ def to_hash
18
+ Ohm.redis.hgetall self.key
19
+ end
20
+ end
21
+
22
+ Object.const_set(model.name, dynamic_class)
23
+ dynamic_class
24
+ end
25
+
26
+ # set a session index, so that Ohm finder will work
27
+ klass.indices << :session_id unless klass.indices.include? :session_id
28
+
29
+ klass.find({
30
+ :session_id => self.id
31
+ }.merge(params))
32
+ end
33
+
34
+ def validate
35
+ assert_present :model
36
+ end
37
+
38
+ def to_hash
39
+ attrs = attributes.reduce({}) { |memo, attribute|
40
+ memo.merge(attribute => send(attribute))
41
+ }.merge({
42
+ :records => records.map(&:to_hash),
43
+ :model => model.to_hash
44
+ })
45
+
46
+ super.merge attrs
47
+ end
48
+
49
+ def to_csv
11
50
  end
12
51
  end
@@ -4,9 +4,9 @@ class ExtraLoop::ScraperBase
4
4
  def set_storage(model, title=nil)
5
5
  collection_name = "#{Time.now.to_i} #{model.to_s} Dataset"
6
6
  title ||= collection_name
7
- log_session! title
8
7
 
9
8
  @model = model_klass = model.respond_to?(:new) && model || ExtraLoop::Storage::DatasetFactory.new(model.to_sym, @extractor_args.map(&:first)).get_class
9
+ log_session! title
10
10
 
11
11
  on :data do |results|
12
12
  results = results.map { |result| @scraper.send(:instanciate_model, result) }
@@ -17,7 +17,12 @@ class ExtraLoop::ScraperBase
17
17
  protected
18
18
  # Creates a scraping session
19
19
  def log_session!(title="")
20
- @session ||= ExtraLoop::Storage::ScrapingSession.create :title => title
20
+ if !@session
21
+ ns = ExtraLoop::Storage
22
+ results = ns::Model.find :name => @model
23
+ model = results.any? && results.first || ns::Model.create(:name => @model)
24
+ @session = ns::ScrapingSession.create :title => title, :model => model
25
+ end
21
26
  end
22
27
 
23
28
  # Converts extracted records into instances of the dataset model specified as the first argument
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: extraloop-redis-storage
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-02-21 00:00:00.000000000Z
12
+ date: 2012-02-26 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: extraloop
16
- requirement: &19118640 !ruby/object:Gem::Requirement
16
+ requirement: &18201600 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 0.0.3
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *19118640
24
+ version_requirements: *18201600
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: ohm
27
- requirement: &19118180 !ruby/object:Gem::Requirement
27
+ requirement: &18201140 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 0.1.3
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *19118180
35
+ version_requirements: *18201140
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: ohm-contrib
38
- requirement: &19117720 !ruby/object:Gem::Requirement
38
+ requirement: &18200680 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,21 @@ dependencies:
43
43
  version: 0.1.2
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *19117720
46
+ version_requirements: *18200680
47
+ - !ruby/object:Gem::Dependency
48
+ name: thor
49
+ requirement: &18200080 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - =
53
+ - !ruby/object:Gem::Version
54
+ version: 0.14.6
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: *18200080
47
58
  - !ruby/object:Gem::Dependency
48
59
  name: rspec
49
- requirement: &19117260 !ruby/object:Gem::Requirement
60
+ requirement: &18199420 !ruby/object:Gem::Requirement
50
61
  none: false
51
62
  requirements:
52
63
  - - ~>
@@ -54,10 +65,10 @@ dependencies:
54
65
  version: 2.7.0
55
66
  type: :development
56
67
  prerelease: false
57
- version_requirements: *19117260
68
+ version_requirements: *18199420
58
69
  - !ruby/object:Gem::Dependency
59
70
  name: rr
60
- requirement: &19116800 !ruby/object:Gem::Requirement
71
+ requirement: &18198820 !ruby/object:Gem::Requirement
61
72
  none: false
62
73
  requirements:
63
74
  - - ~>
@@ -65,10 +76,10 @@ dependencies:
65
76
  version: 1.0.4
66
77
  type: :development
67
78
  prerelease: false
68
- version_requirements: *19116800
79
+ version_requirements: *18198820
69
80
  - !ruby/object:Gem::Dependency
70
81
  name: pry
71
- requirement: &19116340 !ruby/object:Gem::Requirement
82
+ requirement: &18187500 !ruby/object:Gem::Requirement
72
83
  none: false
73
84
  requirements:
74
85
  - - ~>
@@ -76,7 +87,7 @@ dependencies:
76
87
  version: 0.9.7.4
77
88
  type: :development
78
89
  prerelease: false
79
- version_requirements: *19116340
90
+ version_requirements: *18187500
80
91
  description: Redis+Ohm based storage for data sets extracted using the ExtraLoop toolkit.
81
92
  email: andrea.giulio.fiore@googlemail.com
82
93
  executables: []
@@ -87,9 +98,11 @@ files:
87
98
  - README.rdoc
88
99
  - examples/amazon_reviews.rb
89
100
  - examples/google_news_scraper.rb
90
- - examples/lib/amazon_review_scraper.rb
101
+ - examples/lib/models/amazon_review.rb
102
+ - examples/lib/scrapers/amazon_review_scraper.rb
91
103
  - lib/extraloop/redis-storage.rb
92
104
  - lib/extraloop/redis-storage/dataset_factory.rb
105
+ - lib/extraloop/redis-storage/model.rb
93
106
  - lib/extraloop/redis-storage/record.rb
94
107
  - lib/extraloop/redis-storage/scraping_session.rb
95
108
  - lib/extraloop/scraper_base.rb