pupa 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/pupa/models/base.rb +14 -1
- data/lib/pupa/processor.rb +25 -4
- data/lib/pupa/runner.rb +18 -1
- data/lib/pupa/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 796eaf63778076cbe361cae74186cdef026f88b2
|
4
|
+
data.tar.gz: d90a3bccc968f958b137254817c548e7b1b11a5b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5a42c57c20190b0678e60bb17832525b911e4f33253212be5dbf27475816f567e23f98be870173793bd11681d510ae760912cacbb1916c671c81bad75ad5ab81
|
7
|
+
data.tar.gz: b307018aecc3d92fa976c9c3a6c2a7fafb548a4a3a57aa3b2a6aa5c4c7fd029c64ebfb9de6947e6202dc223ca3806893e921146188a9a839c3c7993711b4053d
|
data/lib/pupa/models/base.rb
CHANGED
@@ -32,7 +32,8 @@ module Pupa
|
|
32
32
|
# Declare the class' properties.
|
33
33
|
#
|
34
34
|
# When converting an object to a hash using the `to_h` method, only the
|
35
|
-
# properties declared with `attr_accessor` will be included in the hash.
|
35
|
+
# properties declared with `attr_accessor` or `attr_reader` will be
|
36
|
+
# included in the hash.
|
36
37
|
#
|
37
38
|
# @param [Array<Symbol>] the class' properties
|
38
39
|
def attr_accessor(*attributes)
|
@@ -40,6 +41,18 @@ module Pupa
|
|
40
41
|
super
|
41
42
|
end
|
42
43
|
|
44
|
+
# Declare the class' properties.
|
45
|
+
#
|
46
|
+
# When converting an object to a hash using the `to_h` method, only the
|
47
|
+
# properties declared with `attr_accessor` or `attr_reader` will be
|
48
|
+
# included in the hash.
|
49
|
+
#
|
50
|
+
# @param [Array<Symbol>] the class' properties
|
51
|
+
def attr_reader(*attributes)
|
52
|
+
self.properties += attributes # use assignment to not overwrite the parent's attribute
|
53
|
+
super
|
54
|
+
end
|
55
|
+
|
43
56
|
# Declare the class' foreign keys.
|
44
57
|
#
|
45
58
|
# When importing scraped objects, the foreign keys will be used to draw a
|
data/lib/pupa/processor.rb
CHANGED
@@ -17,6 +17,8 @@ module Pupa
|
|
17
17
|
class_attribute :tasks
|
18
18
|
self.tasks = []
|
19
19
|
|
20
|
+
attr_reader :report
|
21
|
+
|
20
22
|
def_delegators :@logger, :debug, :info, :warn, :error, :fatal
|
21
23
|
|
22
24
|
# @param [String] output_dir the directory in which to dump JSON documents
|
@@ -31,6 +33,7 @@ module Pupa
|
|
31
33
|
@level = level
|
32
34
|
@logger = Logger.new('pupa', level: level, logdev: logdev)
|
33
35
|
@client = Client.new(cache_dir: cache_dir, expires_in: expires_in, level: level)
|
36
|
+
@report = {}
|
34
37
|
end
|
35
38
|
|
36
39
|
# Retrieves and parses a document with a GET request.
|
@@ -100,10 +103,14 @@ module Pupa
|
|
100
103
|
# Dumps scraped objects to disk.
|
101
104
|
#
|
102
105
|
# @param [Symbol] task_name the name of the scraping task to perform
|
106
|
+
# @return [Integer] the number of scraped objects
|
103
107
|
def dump_scraped_objects(task_name)
|
108
|
+
count = 0
|
104
109
|
send(task_name).each do |object|
|
110
|
+
count += 1 # we don't know the size of the enumeration
|
105
111
|
dump_scraped_object(object)
|
106
112
|
end
|
113
|
+
count
|
107
114
|
end
|
108
115
|
|
109
116
|
# Saves scraped objects to a database.
|
@@ -114,6 +121,8 @@ module Pupa
|
|
114
121
|
# @raises [Pupa::Errors::DuplicateDocumentError] if duplicate objects were
|
115
122
|
# inadvertently saved to the database
|
116
123
|
def import
|
124
|
+
@report[:import] = {}
|
125
|
+
|
117
126
|
objects = deduplicate(load_scraped_objects)
|
118
127
|
|
119
128
|
object_id_to_database_id = {}
|
@@ -126,7 +135,7 @@ module Pupa
|
|
126
135
|
object = objects[id]
|
127
136
|
resolve_foreign_keys(object, object_id_to_database_id)
|
128
137
|
# The dependency graph strategy only works if there are no foreign objects.
|
129
|
-
object_id_to_database_id[id] = Persistence.new(object).save
|
138
|
+
object_id_to_database_id[id] = import_object(object)
|
130
139
|
end
|
131
140
|
else
|
132
141
|
size = objects.size
|
@@ -158,7 +167,7 @@ module Pupa
|
|
158
167
|
progress_made = true
|
159
168
|
resolve_foreign_keys(object, object_id_to_database_id)
|
160
169
|
resolve_foreign_objects(object)
|
161
|
-
object_id_to_database_id[id] = Persistence.new(object).save
|
170
|
+
object_id_to_database_id[id] = import_object(object)
|
162
171
|
end
|
163
172
|
end
|
164
173
|
|
@@ -204,7 +213,7 @@ module Pupa
|
|
204
213
|
# @raises [Pupa::Errors::DuplicateObjectIdError]
|
205
214
|
def dump_scraped_object(object)
|
206
215
|
type = object.class.to_s.demodulize.underscore
|
207
|
-
basename = "#{type}_#{object._id}.json"
|
216
|
+
basename = "#{type}_#{object._id.gsub(File::SEPARATOR, '_')}.json"
|
208
217
|
path = File.join(@output_dir, basename)
|
209
218
|
|
210
219
|
if File.exist?(path)
|
@@ -336,7 +345,7 @@ module Pupa
|
|
336
345
|
|
337
346
|
# Resolves an object's foreign objects to database IDs.
|
338
347
|
#
|
339
|
-
# @param [Object] an object
|
348
|
+
# @param [Object] object an object
|
340
349
|
# @raises [Pupa::Errors::MissingDatabaseIdError]
|
341
350
|
def resolve_foreign_objects(object)
|
342
351
|
object.foreign_objects.each do |property|
|
@@ -347,5 +356,17 @@ module Pupa
|
|
347
356
|
end
|
348
357
|
end
|
349
358
|
end
|
359
|
+
|
360
|
+
# @param [Object] object an object
|
361
|
+
def import_object(object)
|
362
|
+
id = Persistence.new(object).save
|
363
|
+
@report[:import][object._type] ||= Hash.new(0)
|
364
|
+
if id == object._id
|
365
|
+
@report[:import][object._type][:insert] += 1
|
366
|
+
else
|
367
|
+
@report[:import][object._type][:update] += 1
|
368
|
+
end
|
369
|
+
id
|
370
|
+
end
|
350
371
|
end
|
351
372
|
end
|
data/lib/pupa/runner.rb
CHANGED
@@ -162,6 +162,15 @@ module Pupa
|
|
162
162
|
|
163
163
|
exit if options.dry_run
|
164
164
|
|
165
|
+
report = {
|
166
|
+
plan: {
|
167
|
+
processor: @processor_class,
|
168
|
+
arguments: options.to_h,
|
169
|
+
options: rest,
|
170
|
+
},
|
171
|
+
start: Time.now.utc,
|
172
|
+
}
|
173
|
+
|
165
174
|
Pupa.session = Moped::Session.new([options.host_with_port], database: options.database)
|
166
175
|
|
167
176
|
if options.actions.delete('scrape')
|
@@ -172,14 +181,22 @@ module Pupa
|
|
172
181
|
FileUtils.rm(path)
|
173
182
|
end
|
174
183
|
|
184
|
+
report[:scrape] = {}
|
175
185
|
options.tasks.each do |task_name|
|
176
|
-
processor.dump_scraped_objects(task_name)
|
186
|
+
report[:scrape][task_name] = processor.dump_scraped_objects(task_name)
|
177
187
|
end
|
178
188
|
end
|
179
189
|
|
180
190
|
options.actions.each do |action|
|
181
191
|
processor.send(action)
|
192
|
+
if processor.report.key?(action.to_sym)
|
193
|
+
report.update(action.to_sym => processor.report[action.to_sym])
|
194
|
+
end
|
182
195
|
end
|
196
|
+
|
197
|
+
report[:end] = Time.now.utc
|
198
|
+
report[:time] = report[:end] - report[:start]
|
199
|
+
puts JSON.dump(report)
|
183
200
|
end
|
184
201
|
end
|
185
202
|
end
|
data/lib/pupa/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pupa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Open North
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-09-
|
11
|
+
date: 2013-09-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|