pupa 0.0.9 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -0
- data/lib/pupa/models/concerns/contactable.rb +6 -1
- data/lib/pupa/models/concerns/identifiable.rb +6 -1
- data/lib/pupa/models/concerns/linkable.rb +6 -1
- data/lib/pupa/models/concerns/nameable.rb +6 -1
- data/lib/pupa/models/concerns/sourceable.rb +6 -1
- data/lib/pupa/processor/client.rb +3 -3
- data/lib/pupa/processor/persistence.rb +3 -3
- data/lib/pupa/processor.rb +7 -6
- data/lib/pupa/refinements/json-schema.rb +18 -10
- data/lib/pupa/runner.rb +3 -3
- data/lib/pupa/version.rb +1 -1
- data/spec/models/concerns/contactable_spec.rb +12 -0
- data/spec/models/concerns/identifiable_spec.rb +12 -0
- data/spec/processor/persistence_spec.rb +3 -3
- data/spec/processor_spec.rb +4 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1e1915f53def27a04384e080b43b7eaa1c024398
|
4
|
+
data.tar.gz: 75b0c4e987b976448a816b3b36be99942c393ddd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5800302c7e32286138dc9f64b8dc06cd13c07cb4cfc5478d572b8fdc10c04483d595c7f48ceddb82efe4330ce280dc3b58467be50db3fa264e1429a24b3b7e0f
|
7
|
+
data.tar.gz: 6cf10d376b218ce4190b4b08da3dbaa1f3fddf87763623f23368eccdccc1a305d611bb3eb8a0de37c55af961eca5ef0a88f9d613bdc08b9e59139c5fc5194b30
|
data/README.md
CHANGED
@@ -155,6 +155,8 @@ Note that Pupa.rb flushes the Redis database before scraping. If you use Redis,
|
|
155
155
|
|
156
156
|
The `json-schema` gem is slow compared to, for example, [JSV](https://github.com/garycourt/JSV). Setting the `--no-validate` switch and running JSON Schema validations separately can further reduce a scraper's running time.
|
157
157
|
|
158
|
+
The [pupa-validate](https://npmjs.org/package/pupa-validate) npm package can be used to validate JSON documents using the faster JSV. In one example, using JSV instead of the `json-schema` gem halved the time needed to validate 10,000 documents.
|
159
|
+
|
158
160
|
### Parsing JSON
|
159
161
|
|
160
162
|
If the rest of your scraper is fast, you may see an improvement by using the `oj` gem. Just `require 'oj'` and Pupa.rb will automatically pick it up, since it uses [MultiJson](https://github.com/intridea/multi_json).
|
@@ -9,6 +9,11 @@ module Pupa
|
|
9
9
|
dump :contact_details
|
10
10
|
end
|
11
11
|
|
12
|
+
def initialize(*args)
|
13
|
+
@contact_details = ContactDetailList.new
|
14
|
+
super
|
15
|
+
end
|
16
|
+
|
12
17
|
# Sets the contact details.
|
13
18
|
#
|
14
19
|
# @param [Array] contact_details a list of contact details
|
@@ -27,7 +32,7 @@ module Pupa
|
|
27
32
|
data[:note] = note
|
28
33
|
end
|
29
34
|
if type && value
|
30
|
-
|
35
|
+
@contact_details << data
|
31
36
|
end
|
32
37
|
end
|
33
38
|
end
|
@@ -9,6 +9,11 @@ module Pupa
|
|
9
9
|
dump :identifiers
|
10
10
|
end
|
11
11
|
|
12
|
+
def initialize(*args)
|
13
|
+
@identifiers = IdentifierList.new
|
14
|
+
super
|
15
|
+
end
|
16
|
+
|
12
17
|
# Sets the identifiers.
|
13
18
|
#
|
14
19
|
# @param [Array] identifiers a list of identifiers
|
@@ -26,7 +31,7 @@ module Pupa
|
|
26
31
|
data[:scheme] = scheme
|
27
32
|
end
|
28
33
|
if identifier
|
29
|
-
|
34
|
+
@identifiers << data
|
30
35
|
end
|
31
36
|
end
|
32
37
|
end
|
@@ -9,6 +9,11 @@ module Pupa
|
|
9
9
|
dump :links
|
10
10
|
end
|
11
11
|
|
12
|
+
def initialize(*args)
|
13
|
+
@links = []
|
14
|
+
super
|
15
|
+
end
|
16
|
+
|
12
17
|
# Adds a URL.
|
13
18
|
#
|
14
19
|
# @param [String] url a URL
|
@@ -19,7 +24,7 @@ module Pupa
|
|
19
24
|
data[:note] = note
|
20
25
|
end
|
21
26
|
if url
|
22
|
-
|
27
|
+
@links << data
|
23
28
|
end
|
24
29
|
end
|
25
30
|
end
|
@@ -9,6 +9,11 @@ module Pupa
|
|
9
9
|
dump :other_names
|
10
10
|
end
|
11
11
|
|
12
|
+
def initialize(*args)
|
13
|
+
@other_names = []
|
14
|
+
super
|
15
|
+
end
|
16
|
+
|
12
17
|
# Adds an alternate or former name.
|
13
18
|
#
|
14
19
|
# @param [String] name an alternate or former name
|
@@ -27,7 +32,7 @@ module Pupa
|
|
27
32
|
data[:note] = note
|
28
33
|
end
|
29
34
|
if name
|
30
|
-
|
35
|
+
@other_names << data
|
31
36
|
end
|
32
37
|
end
|
33
38
|
end
|
@@ -9,6 +9,11 @@ module Pupa
|
|
9
9
|
dump :sources
|
10
10
|
end
|
11
11
|
|
12
|
+
def initialize(*args)
|
13
|
+
@sources = []
|
14
|
+
super
|
15
|
+
end
|
16
|
+
|
12
17
|
# Adds a source to the object.
|
13
18
|
#
|
14
19
|
# @param [String] url a URL
|
@@ -19,7 +24,7 @@ module Pupa
|
|
19
24
|
data[:note] = note
|
20
25
|
end
|
21
26
|
if url
|
22
|
-
|
27
|
+
@sources << data
|
23
28
|
end
|
24
29
|
end
|
25
30
|
end
|
@@ -36,17 +36,17 @@ module Pupa
|
|
36
36
|
connection.use Middleware::RaiseError # useful for breaking concurrent requests
|
37
37
|
|
38
38
|
# @see http://tools.ietf.org/html/rfc4627
|
39
|
-
connection.use Middleware::ParseJson, content_type: /\bjson$/
|
39
|
+
connection.use Middleware::ParseJson, preserve_raw: true, content_type: /\bjson$/
|
40
40
|
|
41
41
|
# @see http://tools.ietf.org/html/rfc2854
|
42
42
|
# @see http://tools.ietf.org/html/rfc3236
|
43
43
|
if defined?(Nokogiri)
|
44
|
-
connection.use Middleware::ParseHtml, content_type: %w(text/html application/xhtml+xml)
|
44
|
+
connection.use Middleware::ParseHtml, preserve_raw: true, content_type: %w(text/html application/xhtml+xml)
|
45
45
|
end
|
46
46
|
|
47
47
|
# @see http://tools.ietf.org/html/rfc3023
|
48
48
|
if defined?(MultiXml)
|
49
|
-
connection.use FaradayMiddleware::ParseXml, content_type: /\bxml$/
|
49
|
+
connection.use FaradayMiddleware::ParseXml, preserve_raw: true, content_type: /\bxml$/
|
50
50
|
end
|
51
51
|
|
52
52
|
if cache_dir
|
@@ -30,7 +30,7 @@ module Pupa
|
|
30
30
|
|
31
31
|
# Saves an object to MongoDB.
|
32
32
|
#
|
33
|
-
# @return [
|
33
|
+
# @return [Array] whether the object was inserted and the object's database ID
|
34
34
|
# @raise [Pupa::Errors::TooManyMatches] if multiple documents would be updated
|
35
35
|
def save
|
36
36
|
selector = @object.fingerprint
|
@@ -42,11 +42,11 @@ module Pupa
|
|
42
42
|
when 0
|
43
43
|
@object.run_callbacks(:create) do
|
44
44
|
collection.insert(@object.to_h(persist: true))
|
45
|
-
@object._id.to_s
|
45
|
+
[true, @object._id.to_s]
|
46
46
|
end
|
47
47
|
when 1
|
48
48
|
query.update(@object.to_h(persist: true))
|
49
|
-
query.first['_id'].to_s
|
49
|
+
[false, query.first['_id'].to_s]
|
50
50
|
else
|
51
51
|
raise Errors::TooManyMatches, "selector matches multiple documents during save: #{collection_name} #{MultiJson.dump(selector)}"
|
52
52
|
end
|
data/lib/pupa/processor.rb
CHANGED
@@ -116,16 +116,17 @@ module Pupa
|
|
116
116
|
# Dumps scraped objects to disk.
|
117
117
|
#
|
118
118
|
# @param [Symbol] task_name the name of the scraping task to perform
|
119
|
-
# @return [
|
119
|
+
# @return [Hash] the number of scraped objects by type
|
120
|
+
# @raise [Pupa::Errors::DuplicateObjectIdError]
|
120
121
|
def dump_scraped_objects(task_name)
|
121
|
-
|
122
|
+
counts = Hash.new(0)
|
122
123
|
@store.pipelined do
|
123
124
|
send(task_name).each do |object|
|
124
|
-
|
125
|
+
counts[object._type] += 1
|
125
126
|
dump_scraped_object(object)
|
126
127
|
end
|
127
128
|
end
|
128
|
-
|
129
|
+
counts
|
129
130
|
end
|
130
131
|
|
131
132
|
# Saves scraped objects to a database.
|
@@ -371,9 +372,9 @@ module Pupa
|
|
371
372
|
|
372
373
|
# @param [Object] object an object
|
373
374
|
def import_object(object)
|
374
|
-
id = Persistence.new(object).save
|
375
|
+
inserted, id = Persistence.new(object).save
|
375
376
|
@report[:import][object._type] ||= Hash.new(0)
|
376
|
-
if
|
377
|
+
if inserted
|
377
378
|
@report[:import][object._type][:insert] += 1
|
378
379
|
else
|
379
380
|
@report[:import][object._type][:update] += 1
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'mail'
|
2
|
+
|
1
3
|
module Pupa
|
2
4
|
module Refinements
|
3
5
|
# A refinement for JSON Schema to validate "email" and "uri" formats. Using
|
@@ -8,20 +10,26 @@ module Pupa
|
|
8
10
|
def validate(current_schema, data, fragments, processor, validator, options = {})
|
9
11
|
case current_schema.schema['format']
|
10
12
|
when 'email'
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
13
|
+
if String === data
|
14
|
+
address = Mail::Address.new(data)
|
15
|
+
unless (address.address == data && address.domain && address.__send__(:tree).domain.dot_atom_text.elements.size > 1 rescue false)
|
16
|
+
error_message = "The property '#{build_fragment(fragments)}' must be a valid email address (#{data})"
|
17
|
+
validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
|
18
|
+
end
|
19
|
+
else
|
20
|
+
error_message = "The property '#{build_fragment(fragments)}' must be a string (#{data})"
|
15
21
|
validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
|
16
|
-
return
|
17
22
|
end
|
18
23
|
when 'uri'
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
24
|
+
if String === data
|
25
|
+
re = URI::DEFAULT_PARSER.regexp[:ABS_URI]
|
26
|
+
unless re.match(data)
|
27
|
+
error_message = "The property '#{build_fragment(fragments)}' must be a valid URI (#{data})"
|
28
|
+
validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
|
29
|
+
end
|
30
|
+
else
|
31
|
+
error_message = "The property '#{build_fragment(fragments)}' must be string (#{data})"
|
23
32
|
validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
|
24
|
-
return
|
25
33
|
end
|
26
34
|
else
|
27
35
|
super
|
data/lib/pupa/runner.rb
CHANGED
@@ -73,7 +73,7 @@ module Pupa
|
|
73
73
|
opts.on('-t', '--task TASK', @processor_class.tasks, 'Select a scraping task to run (you may give this switch multiple times)', " (#{@processor_class.tasks.join(', ')})") do |v|
|
74
74
|
options.tasks << v
|
75
75
|
end
|
76
|
-
opts.on('-o', '--output_dir PATH', 'The directory or Redis address (e.g. redis://localhost:6379) in which to dump JSON documents') do |v|
|
76
|
+
opts.on('-o', '--output_dir PATH', 'The directory or Redis address (e.g. redis://localhost:6379/0) in which to dump JSON documents') do |v|
|
77
77
|
options.output_dir = v
|
78
78
|
end
|
79
79
|
opts.on('-c', '--cache_dir PATH', 'The directory or Memcached address (e.g. memcached://localhost:11211) in which to cache HTTP requests') do |v|
|
@@ -178,8 +178,8 @@ module Pupa
|
|
178
178
|
report = {
|
179
179
|
plan: {
|
180
180
|
processor: @processor_class,
|
181
|
-
|
182
|
-
|
181
|
+
options: Marshal.load(Marshal.dump(options)).to_h,
|
182
|
+
arguments: rest,
|
183
183
|
},
|
184
184
|
start: Time.now.utc,
|
185
185
|
}
|
data/lib/pupa/version.rb
CHANGED
@@ -12,6 +12,18 @@ describe Pupa::Concerns::Contactable do
|
|
12
12
|
klass.new
|
13
13
|
end
|
14
14
|
|
15
|
+
describe '#initialize' do
|
16
|
+
it 'should initialize an empty ContactDetailList' do
|
17
|
+
object.contact_details.should be_a(Pupa::ContactDetailList)
|
18
|
+
object.contact_details.should == []
|
19
|
+
end
|
20
|
+
|
21
|
+
it 'should initialize the given ContactDetailList' do
|
22
|
+
object = klass.new(contact_details: [{type: 'email', value: 'ceo@example.com', note: 'work'}])
|
23
|
+
object.contact_details.should == [{type: 'email', value: 'ceo@example.com', note: 'work'}]
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
15
27
|
describe '#contact_details=' do
|
16
28
|
it 'should use coerce to a ContactDetailList' do
|
17
29
|
object.contact_details = [{type: 'email', value: 'ceo@example.com', note: 'work'}]
|
@@ -12,6 +12,18 @@ describe Pupa::Concerns::Identifiable do
|
|
12
12
|
klass.new
|
13
13
|
end
|
14
14
|
|
15
|
+
describe '#initialize' do
|
16
|
+
it 'should initialize an empty IdentifierList' do
|
17
|
+
object.identifiers.should be_a(Pupa::IdentifierList)
|
18
|
+
object.identifiers.should == []
|
19
|
+
end
|
20
|
+
|
21
|
+
it 'should initialize the given IdentifierList' do
|
22
|
+
object = klass.new(identifiers: [{identifier: '123456789', scheme: 'DUNS'}])
|
23
|
+
object.identifiers.should == [{identifier: '123456789', scheme: 'DUNS'}]
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
15
27
|
describe '#identifiers=' do
|
16
28
|
it 'should use coerce to a IdentifierList' do
|
17
29
|
object.identifiers = [{identifier: '123456789', scheme: 'DUNS'}]
|
@@ -5,7 +5,7 @@ describe Pupa::Processor::Persistence do
|
|
5
5
|
Pupa.session = Moped::Session.new(['localhost:27017'], database: 'pupa_test')
|
6
6
|
Pupa.session.collections.each(&:drop)
|
7
7
|
|
8
|
-
Pupa::Processor::Persistence.new(Pupa::Person.new(name: 'existing')).save
|
8
|
+
Pupa::Processor::Persistence.new(Pupa::Person.new(_id: 'existing', name: 'existing')).save
|
9
9
|
|
10
10
|
Pupa.session[:people].insert(_type: 'pupa/person', name: 'non-unique')
|
11
11
|
Pupa.session[:people].insert(_type: 'pupa/person', name: 'non-unique')
|
@@ -27,11 +27,11 @@ describe Pupa::Processor::Persistence do
|
|
27
27
|
|
28
28
|
describe '#save' do
|
29
29
|
it 'should insert a document if no matches' do
|
30
|
-
Pupa::Processor::Persistence.new(Pupa::Person.new(_id: 'new', name: 'new')).save.should == 'new'
|
30
|
+
Pupa::Processor::Persistence.new(Pupa::Person.new(_id: 'new', name: 'new')).save.should == [true, 'new']
|
31
31
|
end
|
32
32
|
|
33
33
|
it 'should update a document if one match' do
|
34
|
-
Pupa::Processor::Persistence.new(Pupa::Person.new(_id: '
|
34
|
+
Pupa::Processor::Persistence.new(Pupa::Person.new(_id: 'changed', name: 'existing')).save.should == [false, 'existing']
|
35
35
|
end
|
36
36
|
|
37
37
|
it 'should raise an error if many matches' do
|
data/spec/processor_spec.rb
CHANGED
@@ -67,6 +67,10 @@ describe Pupa::Processor do
|
|
67
67
|
path = "/tmp/person_#{processor.person._id}.json"
|
68
68
|
end
|
69
69
|
|
70
|
+
it 'should return the number of scraped objects by type' do
|
71
|
+
processor.dump_scraped_objects(:people).should == {'pupa/person' => 1}
|
72
|
+
end
|
73
|
+
|
70
74
|
it 'should not overwrite an existing file' do
|
71
75
|
File.open(path, 'w') {}
|
72
76
|
expect{processor.dump_scraped_objects(:people)}.to raise_error(Pupa::Errors::DuplicateObjectIdError)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pupa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Open North
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-10-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|