pupa 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/pupa/models/base.rb +14 -1
- data/lib/pupa/processor.rb +25 -4
- data/lib/pupa/runner.rb +18 -1
- data/lib/pupa/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 796eaf63778076cbe361cae74186cdef026f88b2
|
4
|
+
data.tar.gz: d90a3bccc968f958b137254817c548e7b1b11a5b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5a42c57c20190b0678e60bb17832525b911e4f33253212be5dbf27475816f567e23f98be870173793bd11681d510ae760912cacbb1916c671c81bad75ad5ab81
|
7
|
+
data.tar.gz: b307018aecc3d92fa976c9c3a6c2a7fafb548a4a3a57aa3b2a6aa5c4c7fd029c64ebfb9de6947e6202dc223ca3806893e921146188a9a839c3c7993711b4053d
|
data/lib/pupa/models/base.rb
CHANGED
@@ -32,7 +32,8 @@ module Pupa
|
|
32
32
|
# Declare the class' properties.
|
33
33
|
#
|
34
34
|
# When converting an object to a hash using the `to_h` method, only the
|
35
|
-
# properties declared with `attr_accessor` will be
|
35
|
+
# properties declared with `attr_accessor` or `attr_reader` will be
|
36
|
+
# included in the hash.
|
36
37
|
#
|
37
38
|
# @param [Array<Symbol>] the class' properties
|
38
39
|
def attr_accessor(*attributes)
|
@@ -40,6 +41,18 @@ module Pupa
|
|
40
41
|
super
|
41
42
|
end
|
42
43
|
|
44
|
+
# Declare the class' properties.
|
45
|
+
#
|
46
|
+
# When converting an object to a hash using the `to_h` method, only the
|
47
|
+
# properties declared with `attr_accessor` or `attr_reader` will be
|
48
|
+
# included in the hash.
|
49
|
+
#
|
50
|
+
# @param [Array<Symbol>] the class' properties
|
51
|
+
def attr_reader(*attributes)
|
52
|
+
self.properties += attributes # use assignment to not overwrite the parent's attribute
|
53
|
+
super
|
54
|
+
end
|
55
|
+
|
43
56
|
# Declare the class' foreign keys.
|
44
57
|
#
|
45
58
|
# When importing scraped objects, the foreign keys will be used to draw a
|
data/lib/pupa/processor.rb
CHANGED
@@ -17,6 +17,8 @@ module Pupa
|
|
17
17
|
class_attribute :tasks
|
18
18
|
self.tasks = []
|
19
19
|
|
20
|
+
attr_reader :report
|
21
|
+
|
20
22
|
def_delegators :@logger, :debug, :info, :warn, :error, :fatal
|
21
23
|
|
22
24
|
# @param [String] output_dir the directory in which to dump JSON documents
|
@@ -31,6 +33,7 @@ module Pupa
|
|
31
33
|
@level = level
|
32
34
|
@logger = Logger.new('pupa', level: level, logdev: logdev)
|
33
35
|
@client = Client.new(cache_dir: cache_dir, expires_in: expires_in, level: level)
|
36
|
+
@report = {}
|
34
37
|
end
|
35
38
|
|
36
39
|
# Retrieves and parses a document with a GET request.
|
@@ -100,10 +103,14 @@ module Pupa
|
|
100
103
|
# Dumps scraped objects to disk.
|
101
104
|
#
|
102
105
|
# @param [Symbol] task_name the name of the scraping task to perform
|
106
|
+
# @return [Integer] the number of scraped objects
|
103
107
|
def dump_scraped_objects(task_name)
|
108
|
+
count = 0
|
104
109
|
send(task_name).each do |object|
|
110
|
+
count += 1 # we don't know the size of the enumeration
|
105
111
|
dump_scraped_object(object)
|
106
112
|
end
|
113
|
+
count
|
107
114
|
end
|
108
115
|
|
109
116
|
# Saves scraped objects to a database.
|
@@ -114,6 +121,8 @@ module Pupa
|
|
114
121
|
# @raises [Pupa::Errors::DuplicateDocumentError] if duplicate objects were
|
115
122
|
# inadvertently saved to the database
|
116
123
|
def import
|
124
|
+
@report[:import] = {}
|
125
|
+
|
117
126
|
objects = deduplicate(load_scraped_objects)
|
118
127
|
|
119
128
|
object_id_to_database_id = {}
|
@@ -126,7 +135,7 @@ module Pupa
|
|
126
135
|
object = objects[id]
|
127
136
|
resolve_foreign_keys(object, object_id_to_database_id)
|
128
137
|
# The dependency graph strategy only works if there are no foreign objects.
|
129
|
-
object_id_to_database_id[id] = Persistence.new(object).save
|
138
|
+
object_id_to_database_id[id] = import_object(object)
|
130
139
|
end
|
131
140
|
else
|
132
141
|
size = objects.size
|
@@ -158,7 +167,7 @@ module Pupa
|
|
158
167
|
progress_made = true
|
159
168
|
resolve_foreign_keys(object, object_id_to_database_id)
|
160
169
|
resolve_foreign_objects(object)
|
161
|
-
object_id_to_database_id[id] = Persistence.new(object).save
|
170
|
+
object_id_to_database_id[id] = import_object(object)
|
162
171
|
end
|
163
172
|
end
|
164
173
|
|
@@ -204,7 +213,7 @@ module Pupa
|
|
204
213
|
# @raises [Pupa::Errors::DuplicateObjectIdError]
|
205
214
|
def dump_scraped_object(object)
|
206
215
|
type = object.class.to_s.demodulize.underscore
|
207
|
-
basename = "#{type}_#{object._id}.json"
|
216
|
+
basename = "#{type}_#{object._id.gsub(File::SEPARATOR, '_')}.json"
|
208
217
|
path = File.join(@output_dir, basename)
|
209
218
|
|
210
219
|
if File.exist?(path)
|
@@ -336,7 +345,7 @@ module Pupa
|
|
336
345
|
|
337
346
|
# Resolves an object's foreign objects to database IDs.
|
338
347
|
#
|
339
|
-
# @param [Object] an object
|
348
|
+
# @param [Object] object an object
|
340
349
|
# @raises [Pupa::Errors::MissingDatabaseIdError]
|
341
350
|
def resolve_foreign_objects(object)
|
342
351
|
object.foreign_objects.each do |property|
|
@@ -347,5 +356,17 @@ module Pupa
|
|
347
356
|
end
|
348
357
|
end
|
349
358
|
end
|
359
|
+
|
360
|
+
# @param [Object] object an object
|
361
|
+
def import_object(object)
|
362
|
+
id = Persistence.new(object).save
|
363
|
+
@report[:import][object._type] ||= Hash.new(0)
|
364
|
+
if id == object._id
|
365
|
+
@report[:import][object._type][:insert] += 1
|
366
|
+
else
|
367
|
+
@report[:import][object._type][:update] += 1
|
368
|
+
end
|
369
|
+
id
|
370
|
+
end
|
350
371
|
end
|
351
372
|
end
|
data/lib/pupa/runner.rb
CHANGED
@@ -162,6 +162,15 @@ module Pupa
|
|
162
162
|
|
163
163
|
exit if options.dry_run
|
164
164
|
|
165
|
+
report = {
|
166
|
+
plan: {
|
167
|
+
processor: @processor_class,
|
168
|
+
arguments: options.to_h,
|
169
|
+
options: rest,
|
170
|
+
},
|
171
|
+
start: Time.now.utc,
|
172
|
+
}
|
173
|
+
|
165
174
|
Pupa.session = Moped::Session.new([options.host_with_port], database: options.database)
|
166
175
|
|
167
176
|
if options.actions.delete('scrape')
|
@@ -172,14 +181,22 @@ module Pupa
|
|
172
181
|
FileUtils.rm(path)
|
173
182
|
end
|
174
183
|
|
184
|
+
report[:scrape] = {}
|
175
185
|
options.tasks.each do |task_name|
|
176
|
-
processor.dump_scraped_objects(task_name)
|
186
|
+
report[:scrape][task_name] = processor.dump_scraped_objects(task_name)
|
177
187
|
end
|
178
188
|
end
|
179
189
|
|
180
190
|
options.actions.each do |action|
|
181
191
|
processor.send(action)
|
192
|
+
if processor.report.key?(action.to_sym)
|
193
|
+
report.update(action.to_sym => processor.report[action.to_sym])
|
194
|
+
end
|
182
195
|
end
|
196
|
+
|
197
|
+
report[:end] = Time.now.utc
|
198
|
+
report[:time] = report[:end] - report[:start]
|
199
|
+
puts JSON.dump(report)
|
183
200
|
end
|
184
201
|
end
|
185
202
|
end
|
data/lib/pupa/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pupa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Open North
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-09-
|
11
|
+
date: 2013-09-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|