extraloop-redis-storage 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -12,18 +12,18 @@ makes storing and managing Ruby objects in Redis easy and convenient.
12
12
 
13
13
  == Usage
14
14
 
15
- Extraloop's Redis storage module decorates +ExtraLoop::ScraperBase+ and +ExtraLoop::IterativeScraper+ instances
15
+ Extraloop's Redis storage module decorates <code>ExtraLoop::ScraperBase</code> and <code>ExtraLoop::IterativeScraper</code> instances
16
16
  with the +set_storage+ method: a helper method that allows you to specify how the scraped data should be stored.
17
17
 
18
18
  require "extraloop/redis-storage"
19
19
 
20
20
  class AmazonReview < ExtraLoop::Storage::Record
21
21
  attribute :title
22
- attribute :author
23
- attribute :star
22
+ attribute :rank
23
+ attribute :date
24
24
 
25
25
  def validate
26
- (0..5).include star.to_i or && false
26
+ assert (0..5).include?(rank.to_i), "Rank not in range"
27
27
  end
28
28
  end
29
29
 
@@ -32,7 +32,7 @@ with the +set_storage+ method: a helper method that allows to specify how the sc
32
32
  .run()
33
33
 
34
34
  At each scraper run, the ExtraLoop storage module internally instantiates a
35
- session (see +ExtraLoop::Storage::ScrapingSession+) and links the extracted records to it.
35
+ session (see <code>ExtraLoop::Storage::ScrapingSession</code>) and links the extracted records to it.
36
36
  The +AmazonReview+ instances extracted and stored in the example above can in fact be fetched by calling
37
37
  Ohm's +find+ with the session id as argument.
38
38
 
@@ -48,10 +48,6 @@ session instance:
48
48
 
49
49
  The +set_storage+ method can be called with the following arguments:
50
50
 
51
- * _model_ A Ruby constant specifying the model to be used for storing the extracted data (optional; when this argument is not provided, a simple-non validating model will be generated on the fly).
52
- * _session_title_ A human readable name for the extracted dataset.
53
-
54
- == Running the test suite
55
-
56
- The test suite can be run by executing the +rspec\ \*+ command from within the +spec/+ directory.
51
+ * _model_ A Ruby constant specifying the model to be used for storing the extracted data.
52
+ * _session_title_ A human readable title for the extracted dataset (optional).
57
53
 
@@ -1,18 +1,9 @@
1
1
  require "rubygems"
2
- require "extraloop"
3
2
  require "date"
4
- require "./lib/amazon_review_scraper.rb"
3
+ require "extraloop"
5
4
  require "../lib/extraloop/redis-storage.rb"
6
-
7
- class AmazonReview < ExtraLoop::Storage::Record
8
- attribute :title
9
- attribute :rank
10
- attribute :date
11
-
12
- def validate
13
- assert (0..5).include?(rank.to_i), "Rank not in range"
14
- end
15
- end
5
+ require "./lib/models/amazon_review.rb"
6
+ require "./lib/scrapers/amazon_review_scraper.rb"
16
7
 
17
8
  scraper = AmazonReviewScraper.new("0262560992").
18
9
  set_storage(AmazonReview).
@@ -12,6 +12,6 @@ google_news_scraper = ExtraLoop::IterativeScraper.new("https://www.google.com/se
12
12
  set_storage(:GoogleNewsStory).
13
13
  run
14
14
 
15
- puts "#{GoogleNewsStory.all.size} news stories fetched..."
15
+ puts "#{GoogleNewsStory.all.to_a.size} news stories fetched..."
16
16
 
17
17
 
@@ -0,0 +1,9 @@
1
+ class AmazonReview < ExtraLoop::Storage::Record
2
+ attribute :title
3
+ attribute :rank
4
+ attribute :date
5
+
6
+ def validate
7
+ assert (0..5).include?(rank.to_i), "Rank not in range"
8
+ end
9
+ end
@@ -1,30 +1,33 @@
1
1
  require "json"
2
2
  require "rubygems"
3
3
  require "redis"
4
- require 'pry'
5
4
  require "ohm"
6
5
  require "ohm/contrib"
7
6
  require "extraloop"
8
7
 
9
8
  base_path = File.realpath(File.dirname(__FILE__))
10
9
  $: << "#{base_path}"
11
- require "scraper_base"
12
-
13
10
 
11
+ require "scraper_base"
14
12
 
15
13
  module ExtraLoop
16
14
  module Storage
17
15
  VERSION ||= "0.0.1"
18
16
 
19
- class << self
20
- def connect(*args)
21
- Ohm.connect(*args)
22
- end
17
+ def self.connect(*args)
18
+ Ohm.connect(*args)
19
+ end
20
+
21
+ # Tries to automatically locate the models directory and load all Ruby files within it
22
+ def self.autoload_models(dirname='models')
23
+ Dir["**/**#{dirname}/*.rb"].each { |path| require "./#{path}" }
23
24
  end
24
25
  end
25
26
  end
26
27
 
27
28
  ExtraLoop::Storage.autoload :Record, "#{base_path}/redis-storage/record.rb"
28
29
  ExtraLoop::Storage.autoload :ScrapingSession, "#{base_path}/redis-storage/scraping_session.rb"
30
+ ExtraLoop::Storage.autoload :Model, "#{base_path}/redis-storage/model.rb"
29
31
  ExtraLoop::Storage.autoload :DatasetFactory, "#{base_path}/redis-storage/dataset_factory.rb"
30
32
 
33
+
@@ -6,7 +6,8 @@ class ExtraLoop::Storage::DatasetFactory
6
6
 
7
7
  @classname = (classname.to_s.split "").each_with_index.map { |char, index| index == 0 && char.upcase or char }.join
8
8
 
9
- return if Object.const_defined? @classname
9
+ return Object.const_get @classname if Object.const_defined? @classname
10
+
10
11
  Object.const_set(@classname, Class.new(ExtraLoop::Storage::Record) {
11
12
  attributes.each { |attr| attribute attr }
12
13
  })
@@ -0,0 +1,13 @@
1
+ # Meta model designed to keep track of what types of records
2
+ # are associated with a scraping session object.
3
+ #
4
+ class ExtraLoop::Storage::Model < Ohm::Model
5
+ attribute :name
6
+ index :name
7
+
8
+ def to_hash
9
+ super.merge(attributes.reduce({}) { |memo, attribute|
10
+ memo.merge(attribute => send(attribute))
11
+ })
12
+ end
13
+ end
@@ -2,11 +2,50 @@ class ExtraLoop::Storage::ScrapingSession < Ohm::Model
2
2
 
3
3
  include Ohm::Boundaries
4
4
  include Ohm::Timestamping
5
+ include Ohm::Callbacks
5
6
 
6
7
  attribute :title
8
+ reference :model, ExtraLoop::Storage::Model
7
9
 
8
- def records(collection)
9
- Kernel.const_get(collection.to_s).
10
- find(:session_id => self.id)
10
+ def records(params={})
11
+ klass = if Object.const_defined?(model.name)
12
+ Object.const_get(model.name)
13
+ else
14
+ dynamic_class = Class.new(ExtraLoop::Storage::Record) do
15
+ # override default to_hash so that it will return the Redis hash
16
+ # internally stored by Ohm
17
+ def to_hash
18
+ Ohm.redis.hgetall self.key
19
+ end
20
+ end
21
+
22
+ Object.const_set(model.name, dynamic_class)
23
+ dynamic_class
24
+ end
25
+
26
+ # set a session index, so that Ohm finder will work
27
+ klass.indices << :session_id unless klass.indices.include? :session_id
28
+
29
+ klass.find({
30
+ :session_id => self.id
31
+ }.merge(params))
32
+ end
33
+
34
+ def validate
35
+ assert_present :model
36
+ end
37
+
38
+ def to_hash
39
+ attrs = attributes.reduce({}) { |memo, attribute|
40
+ memo.merge(attribute => send(attribute))
41
+ }.merge({
42
+ :records => records.map(&:to_hash),
43
+ :model => model.to_hash
44
+ })
45
+
46
+ super.merge attrs
47
+ end
48
+
49
+ def to_csv
11
50
  end
12
51
  end
@@ -4,9 +4,9 @@ class ExtraLoop::ScraperBase
4
4
  def set_storage(model, title=nil)
5
5
  collection_name = "#{Time.now.to_i} #{model.to_s} Dataset"
6
6
  title ||= collection_name
7
- log_session! title
8
7
 
9
8
  @model = model_klass = model.respond_to?(:new) && model || ExtraLoop::Storage::DatasetFactory.new(model.to_sym, @extractor_args.map(&:first)).get_class
9
+ log_session! title
10
10
 
11
11
  on :data do |results|
12
12
  results = results.map { |result| @scraper.send(:instanciate_model, result) }
@@ -17,7 +17,12 @@ class ExtraLoop::ScraperBase
17
17
  protected
18
18
  # Creates a scraping session
19
19
  def log_session!(title="")
20
- @session ||= ExtraLoop::Storage::ScrapingSession.create :title => title
20
+ if !@session
21
+ ns = ExtraLoop::Storage
22
+ results = ns::Model.find :name => @model
23
+ model = results.any? && results.first || ns::Model.create(:name => @model)
24
+ @session = ns::ScrapingSession.create :title => title, :model => model
25
+ end
21
26
  end
22
27
 
23
28
  # Converts extracted records into instances of the dataset model specified as the first argument
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: extraloop-redis-storage
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-02-21 00:00:00.000000000Z
12
+ date: 2012-02-26 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: extraloop
16
- requirement: &19118640 !ruby/object:Gem::Requirement
16
+ requirement: &18201600 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 0.0.3
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *19118640
24
+ version_requirements: *18201600
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: ohm
27
- requirement: &19118180 !ruby/object:Gem::Requirement
27
+ requirement: &18201140 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 0.1.3
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *19118180
35
+ version_requirements: *18201140
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: ohm-contrib
38
- requirement: &19117720 !ruby/object:Gem::Requirement
38
+ requirement: &18200680 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,21 @@ dependencies:
43
43
  version: 0.1.2
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *19117720
46
+ version_requirements: *18200680
47
+ - !ruby/object:Gem::Dependency
48
+ name: thor
49
+ requirement: &18200080 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - =
53
+ - !ruby/object:Gem::Version
54
+ version: 0.14.6
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: *18200080
47
58
  - !ruby/object:Gem::Dependency
48
59
  name: rspec
49
- requirement: &19117260 !ruby/object:Gem::Requirement
60
+ requirement: &18199420 !ruby/object:Gem::Requirement
50
61
  none: false
51
62
  requirements:
52
63
  - - ~>
@@ -54,10 +65,10 @@ dependencies:
54
65
  version: 2.7.0
55
66
  type: :development
56
67
  prerelease: false
57
- version_requirements: *19117260
68
+ version_requirements: *18199420
58
69
  - !ruby/object:Gem::Dependency
59
70
  name: rr
60
- requirement: &19116800 !ruby/object:Gem::Requirement
71
+ requirement: &18198820 !ruby/object:Gem::Requirement
61
72
  none: false
62
73
  requirements:
63
74
  - - ~>
@@ -65,10 +76,10 @@ dependencies:
65
76
  version: 1.0.4
66
77
  type: :development
67
78
  prerelease: false
68
- version_requirements: *19116800
79
+ version_requirements: *18198820
69
80
  - !ruby/object:Gem::Dependency
70
81
  name: pry
71
- requirement: &19116340 !ruby/object:Gem::Requirement
82
+ requirement: &18187500 !ruby/object:Gem::Requirement
72
83
  none: false
73
84
  requirements:
74
85
  - - ~>
@@ -76,7 +87,7 @@ dependencies:
76
87
  version: 0.9.7.4
77
88
  type: :development
78
89
  prerelease: false
79
- version_requirements: *19116340
90
+ version_requirements: *18187500
80
91
  description: Redis+Ohm based storage for data sets extracted using the ExtraLoop toolkit.
81
92
  email: andrea.giulio.fiore@googlemail.com
82
93
  executables: []
@@ -87,9 +98,11 @@ files:
87
98
  - README.rdoc
88
99
  - examples/amazon_reviews.rb
89
100
  - examples/google_news_scraper.rb
90
- - examples/lib/amazon_review_scraper.rb
101
+ - examples/lib/models/amazon_review.rb
102
+ - examples/lib/scrapers/amazon_review_scraper.rb
91
103
  - lib/extraloop/redis-storage.rb
92
104
  - lib/extraloop/redis-storage/dataset_factory.rb
105
+ - lib/extraloop/redis-storage/model.rb
93
106
  - lib/extraloop/redis-storage/record.rb
94
107
  - lib/extraloop/redis-storage/scraping_session.rb
95
108
  - lib/extraloop/scraper_base.rb