pupa 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b9732b4a0ea1dbd2eb9d8fff1d533bbfd397db6a
4
- data.tar.gz: 659874bfd458daa7c09be7f0f2a5feeb9deec005
3
+ metadata.gz: 796eaf63778076cbe361cae74186cdef026f88b2
4
+ data.tar.gz: d90a3bccc968f958b137254817c548e7b1b11a5b
5
5
  SHA512:
6
- metadata.gz: f24268480ad0447df95bbfa82a86c13d4d82e015309932571b102b73813d480e52d35124ca900f94d9740d58d1b8f3f84e2b9b484dc8c3b879ab6efec8593683
7
- data.tar.gz: e09836eb51002368554645177bbfbee70e91b3dd5900746e26aa10f3196adb5651e455b1d3a87617e325381169367debc5af0b3d3f2ace604aa8d517c8fc585a
6
+ metadata.gz: 5a42c57c20190b0678e60bb17832525b911e4f33253212be5dbf27475816f567e23f98be870173793bd11681d510ae760912cacbb1916c671c81bad75ad5ab81
7
+ data.tar.gz: b307018aecc3d92fa976c9c3a6c2a7fafb548a4a3a57aa3b2a6aa5c4c7fd029c64ebfb9de6947e6202dc223ca3806893e921146188a9a839c3c7993711b4053d
@@ -32,7 +32,8 @@ module Pupa
32
32
  # Declare the class' properties.
33
33
  #
34
34
  # When converting an object to a hash using the `to_h` method, only the
35
- # properties declared with `attr_accessor` will be included in the hash.
35
+ # properties declared with `attr_accessor` or `attr_reader` will be
36
+ # included in the hash.
36
37
  #
37
38
  # @param [Array<Symbol>] the class' properties
38
39
  def attr_accessor(*attributes)
@@ -40,6 +41,18 @@ module Pupa
40
41
  super
41
42
  end
42
43
 
44
+ # Declare the class' properties.
45
+ #
46
+ # When converting an object to a hash using the `to_h` method, only the
47
+ # properties declared with `attr_accessor` or `attr_reader` will be
48
+ # included in the hash.
49
+ #
50
+ # @param [Array<Symbol>] the class' properties
51
+ def attr_reader(*attributes)
52
+ self.properties += attributes # use assignment to not overwrite the parent's attribute
53
+ super
54
+ end
55
+
43
56
  # Declare the class' foreign keys.
44
57
  #
45
58
  # When importing scraped objects, the foreign keys will be used to draw a
@@ -17,6 +17,8 @@ module Pupa
17
17
  class_attribute :tasks
18
18
  self.tasks = []
19
19
 
20
+ attr_reader :report
21
+
20
22
  def_delegators :@logger, :debug, :info, :warn, :error, :fatal
21
23
 
22
24
  # @param [String] output_dir the directory in which to dump JSON documents
@@ -31,6 +33,7 @@ module Pupa
31
33
  @level = level
32
34
  @logger = Logger.new('pupa', level: level, logdev: logdev)
33
35
  @client = Client.new(cache_dir: cache_dir, expires_in: expires_in, level: level)
36
+ @report = {}
34
37
  end
35
38
 
36
39
  # Retrieves and parses a document with a GET request.
@@ -100,10 +103,14 @@ module Pupa
100
103
  # Dumps scraped objects to disk.
101
104
  #
102
105
  # @param [Symbol] task_name the name of the scraping task to perform
106
+ # @return [Integer] the number of scraped objects
103
107
  def dump_scraped_objects(task_name)
108
+ count = 0
104
109
  send(task_name).each do |object|
110
+ count += 1 # we don't know the size of the enumeration
105
111
  dump_scraped_object(object)
106
112
  end
113
+ count
107
114
  end
108
115
 
109
116
  # Saves scraped objects to a database.
@@ -114,6 +121,8 @@ module Pupa
114
121
  # @raises [Pupa::Errors::DuplicateDocumentError] if duplicate objects were
115
122
  # inadvertently saved to the database
116
123
  def import
124
+ @report[:import] = {}
125
+
117
126
  objects = deduplicate(load_scraped_objects)
118
127
 
119
128
  object_id_to_database_id = {}
@@ -126,7 +135,7 @@ module Pupa
126
135
  object = objects[id]
127
136
  resolve_foreign_keys(object, object_id_to_database_id)
128
137
  # The dependency graph strategy only works if there are no foreign objects.
129
- object_id_to_database_id[id] = Persistence.new(object).save
138
+ object_id_to_database_id[id] = import_object(object)
130
139
  end
131
140
  else
132
141
  size = objects.size
@@ -158,7 +167,7 @@ module Pupa
158
167
  progress_made = true
159
168
  resolve_foreign_keys(object, object_id_to_database_id)
160
169
  resolve_foreign_objects(object)
161
- object_id_to_database_id[id] = Persistence.new(object).save
170
+ object_id_to_database_id[id] = import_object(object)
162
171
  end
163
172
  end
164
173
 
@@ -204,7 +213,7 @@ module Pupa
204
213
  # @raises [Pupa::Errors::DuplicateObjectIdError]
205
214
  def dump_scraped_object(object)
206
215
  type = object.class.to_s.demodulize.underscore
207
- basename = "#{type}_#{object._id}.json"
216
+ basename = "#{type}_#{object._id.gsub(File::SEPARATOR, '_')}.json"
208
217
  path = File.join(@output_dir, basename)
209
218
 
210
219
  if File.exist?(path)
@@ -336,7 +345,7 @@ module Pupa
336
345
 
337
346
  # Resolves an object's foreign objects to database IDs.
338
347
  #
339
- # @param [Object] an object
348
+ # @param [Object] object an object
340
349
  # @raises [Pupa::Errors::MissingDatabaseIdError]
341
350
  def resolve_foreign_objects(object)
342
351
  object.foreign_objects.each do |property|
@@ -347,5 +356,17 @@ module Pupa
347
356
  end
348
357
  end
349
358
  end
359
+
360
+ # @param [Object] object an object
361
+ def import_object(object)
362
+ id = Persistence.new(object).save
363
+ @report[:import][object._type] ||= Hash.new(0)
364
+ if id == object._id
365
+ @report[:import][object._type][:insert] += 1
366
+ else
367
+ @report[:import][object._type][:update] += 1
368
+ end
369
+ id
370
+ end
350
371
  end
351
372
  end
data/lib/pupa/runner.rb CHANGED
@@ -162,6 +162,15 @@ module Pupa
162
162
 
163
163
  exit if options.dry_run
164
164
 
165
+ report = {
166
+ plan: {
167
+ processor: @processor_class,
168
+ arguments: options.to_h,
169
+ options: rest,
170
+ },
171
+ start: Time.now.utc,
172
+ }
173
+
165
174
  Pupa.session = Moped::Session.new([options.host_with_port], database: options.database)
166
175
 
167
176
  if options.actions.delete('scrape')
@@ -172,14 +181,22 @@ module Pupa
172
181
  FileUtils.rm(path)
173
182
  end
174
183
 
184
+ report[:scrape] = {}
175
185
  options.tasks.each do |task_name|
176
- processor.dump_scraped_objects(task_name)
186
+ report[:scrape][task_name] = processor.dump_scraped_objects(task_name)
177
187
  end
178
188
  end
179
189
 
180
190
  options.actions.each do |action|
181
191
  processor.send(action)
192
+ if processor.report.key?(action.to_sym)
193
+ report.update(action.to_sym => processor.report[action.to_sym])
194
+ end
182
195
  end
196
+
197
+ report[:end] = Time.now.utc
198
+ report[:time] = report[:end] - report[:start]
199
+ puts JSON.dump(report)
183
200
  end
184
201
  end
185
202
  end
data/lib/pupa/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Pupa
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.5"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pupa
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Open North
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-09-16 00:00:00.000000000 Z
11
+ date: 2013-09-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport