pupa 0.0.9 → 0.0.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -0
- data/lib/pupa/models/concerns/contactable.rb +6 -1
- data/lib/pupa/models/concerns/identifiable.rb +6 -1
- data/lib/pupa/models/concerns/linkable.rb +6 -1
- data/lib/pupa/models/concerns/nameable.rb +6 -1
- data/lib/pupa/models/concerns/sourceable.rb +6 -1
- data/lib/pupa/processor/client.rb +3 -3
- data/lib/pupa/processor/persistence.rb +3 -3
- data/lib/pupa/processor.rb +7 -6
- data/lib/pupa/refinements/json-schema.rb +18 -10
- data/lib/pupa/runner.rb +3 -3
- data/lib/pupa/version.rb +1 -1
- data/spec/models/concerns/contactable_spec.rb +12 -0
- data/spec/models/concerns/identifiable_spec.rb +12 -0
- data/spec/processor/persistence_spec.rb +3 -3
- data/spec/processor_spec.rb +4 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1e1915f53def27a04384e080b43b7eaa1c024398
|
4
|
+
data.tar.gz: 75b0c4e987b976448a816b3b36be99942c393ddd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5800302c7e32286138dc9f64b8dc06cd13c07cb4cfc5478d572b8fdc10c04483d595c7f48ceddb82efe4330ce280dc3b58467be50db3fa264e1429a24b3b7e0f
|
7
|
+
data.tar.gz: 6cf10d376b218ce4190b4b08da3dbaa1f3fddf87763623f23368eccdccc1a305d611bb3eb8a0de37c55af961eca5ef0a88f9d613bdc08b9e59139c5fc5194b30
|
data/README.md
CHANGED
@@ -155,6 +155,8 @@ Note that Pupa.rb flushes the Redis database before scraping. If you use Redis,
|
|
155
155
|
|
156
156
|
The `json-schema` gem is slow compared to, for example, [JSV](https://github.com/garycourt/JSV). Setting the `--no-validate` switch and running JSON Schema validations separately can further reduce a scraper's running time.
|
157
157
|
|
158
|
+
The [pupa-validate](https://npmjs.org/package/pupa-validate) npm package can be used to validate JSON documents using the faster JSV. In one example, using JSV instead of the `json-schema` gem halved the time needed to validate 10,000 documents.
|
159
|
+
|
158
160
|
### Parsing JSON
|
159
161
|
|
160
162
|
If the rest of your scraper is fast, you may see an improvement by using the `oj` gem. Just `require 'oj'` and Pupa.rb will automatically pick it up, since it uses [MultiJson](https://github.com/intridea/multi_json).
|
@@ -9,6 +9,11 @@ module Pupa
|
|
9
9
|
dump :contact_details
|
10
10
|
end
|
11
11
|
|
12
|
+
def initialize(*args)
|
13
|
+
@contact_details = ContactDetailList.new
|
14
|
+
super
|
15
|
+
end
|
16
|
+
|
12
17
|
# Sets the contact details.
|
13
18
|
#
|
14
19
|
# @param [Array] contact_details a list of contact details
|
@@ -27,7 +32,7 @@ module Pupa
|
|
27
32
|
data[:note] = note
|
28
33
|
end
|
29
34
|
if type && value
|
30
|
-
|
35
|
+
@contact_details << data
|
31
36
|
end
|
32
37
|
end
|
33
38
|
end
|
@@ -9,6 +9,11 @@ module Pupa
|
|
9
9
|
dump :identifiers
|
10
10
|
end
|
11
11
|
|
12
|
+
def initialize(*args)
|
13
|
+
@identifiers = IdentifierList.new
|
14
|
+
super
|
15
|
+
end
|
16
|
+
|
12
17
|
# Sets the identifiers.
|
13
18
|
#
|
14
19
|
# @param [Array] identifiers a list of identifiers
|
@@ -26,7 +31,7 @@ module Pupa
|
|
26
31
|
data[:scheme] = scheme
|
27
32
|
end
|
28
33
|
if identifier
|
29
|
-
|
34
|
+
@identifiers << data
|
30
35
|
end
|
31
36
|
end
|
32
37
|
end
|
@@ -9,6 +9,11 @@ module Pupa
|
|
9
9
|
dump :links
|
10
10
|
end
|
11
11
|
|
12
|
+
def initialize(*args)
|
13
|
+
@links = []
|
14
|
+
super
|
15
|
+
end
|
16
|
+
|
12
17
|
# Adds a URL.
|
13
18
|
#
|
14
19
|
# @param [String] url a URL
|
@@ -19,7 +24,7 @@ module Pupa
|
|
19
24
|
data[:note] = note
|
20
25
|
end
|
21
26
|
if url
|
22
|
-
|
27
|
+
@links << data
|
23
28
|
end
|
24
29
|
end
|
25
30
|
end
|
@@ -9,6 +9,11 @@ module Pupa
|
|
9
9
|
dump :other_names
|
10
10
|
end
|
11
11
|
|
12
|
+
def initialize(*args)
|
13
|
+
@other_names = []
|
14
|
+
super
|
15
|
+
end
|
16
|
+
|
12
17
|
# Adds an alternate or former name.
|
13
18
|
#
|
14
19
|
# @param [String] name an alternate or former name
|
@@ -27,7 +32,7 @@ module Pupa
|
|
27
32
|
data[:note] = note
|
28
33
|
end
|
29
34
|
if name
|
30
|
-
|
35
|
+
@other_names << data
|
31
36
|
end
|
32
37
|
end
|
33
38
|
end
|
@@ -9,6 +9,11 @@ module Pupa
|
|
9
9
|
dump :sources
|
10
10
|
end
|
11
11
|
|
12
|
+
def initialize(*args)
|
13
|
+
@sources = []
|
14
|
+
super
|
15
|
+
end
|
16
|
+
|
12
17
|
# Adds a source to the object.
|
13
18
|
#
|
14
19
|
# @param [String] url a URL
|
@@ -19,7 +24,7 @@ module Pupa
|
|
19
24
|
data[:note] = note
|
20
25
|
end
|
21
26
|
if url
|
22
|
-
|
27
|
+
@sources << data
|
23
28
|
end
|
24
29
|
end
|
25
30
|
end
|
@@ -36,17 +36,17 @@ module Pupa
|
|
36
36
|
connection.use Middleware::RaiseError # useful for breaking concurrent requests
|
37
37
|
|
38
38
|
# @see http://tools.ietf.org/html/rfc4627
|
39
|
-
connection.use Middleware::ParseJson, content_type: /\bjson$/
|
39
|
+
connection.use Middleware::ParseJson, preserve_raw: true, content_type: /\bjson$/
|
40
40
|
|
41
41
|
# @see http://tools.ietf.org/html/rfc2854
|
42
42
|
# @see http://tools.ietf.org/html/rfc3236
|
43
43
|
if defined?(Nokogiri)
|
44
|
-
connection.use Middleware::ParseHtml, content_type: %w(text/html application/xhtml+xml)
|
44
|
+
connection.use Middleware::ParseHtml, preserve_raw: true, content_type: %w(text/html application/xhtml+xml)
|
45
45
|
end
|
46
46
|
|
47
47
|
# @see http://tools.ietf.org/html/rfc3023
|
48
48
|
if defined?(MultiXml)
|
49
|
-
connection.use FaradayMiddleware::ParseXml, content_type: /\bxml$/
|
49
|
+
connection.use FaradayMiddleware::ParseXml, preserve_raw: true, content_type: /\bxml$/
|
50
50
|
end
|
51
51
|
|
52
52
|
if cache_dir
|
@@ -30,7 +30,7 @@ module Pupa
|
|
30
30
|
|
31
31
|
# Saves an object to MongoDB.
|
32
32
|
#
|
33
|
-
# @return [
|
33
|
+
# @return [Array] whether the object was inserted and the object's database ID
|
34
34
|
# @raise [Pupa::Errors::TooManyMatches] if multiple documents would be updated
|
35
35
|
def save
|
36
36
|
selector = @object.fingerprint
|
@@ -42,11 +42,11 @@ module Pupa
|
|
42
42
|
when 0
|
43
43
|
@object.run_callbacks(:create) do
|
44
44
|
collection.insert(@object.to_h(persist: true))
|
45
|
-
@object._id.to_s
|
45
|
+
[true, @object._id.to_s]
|
46
46
|
end
|
47
47
|
when 1
|
48
48
|
query.update(@object.to_h(persist: true))
|
49
|
-
query.first['_id'].to_s
|
49
|
+
[false, query.first['_id'].to_s]
|
50
50
|
else
|
51
51
|
raise Errors::TooManyMatches, "selector matches multiple documents during save: #{collection_name} #{MultiJson.dump(selector)}"
|
52
52
|
end
|
data/lib/pupa/processor.rb
CHANGED
@@ -116,16 +116,17 @@ module Pupa
|
|
116
116
|
# Dumps scraped objects to disk.
|
117
117
|
#
|
118
118
|
# @param [Symbol] task_name the name of the scraping task to perform
|
119
|
-
# @return [
|
119
|
+
# @return [Hash] the number of scraped objects by type
|
120
|
+
# @raise [Pupa::Errors::DuplicateObjectIdError]
|
120
121
|
def dump_scraped_objects(task_name)
|
121
|
-
|
122
|
+
counts = Hash.new(0)
|
122
123
|
@store.pipelined do
|
123
124
|
send(task_name).each do |object|
|
124
|
-
|
125
|
+
counts[object._type] += 1
|
125
126
|
dump_scraped_object(object)
|
126
127
|
end
|
127
128
|
end
|
128
|
-
|
129
|
+
counts
|
129
130
|
end
|
130
131
|
|
131
132
|
# Saves scraped objects to a database.
|
@@ -371,9 +372,9 @@ module Pupa
|
|
371
372
|
|
372
373
|
# @param [Object] object an object
|
373
374
|
def import_object(object)
|
374
|
-
id = Persistence.new(object).save
|
375
|
+
inserted, id = Persistence.new(object).save
|
375
376
|
@report[:import][object._type] ||= Hash.new(0)
|
376
|
-
if
|
377
|
+
if inserted
|
377
378
|
@report[:import][object._type][:insert] += 1
|
378
379
|
else
|
379
380
|
@report[:import][object._type][:update] += 1
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'mail'
|
2
|
+
|
1
3
|
module Pupa
|
2
4
|
module Refinements
|
3
5
|
# A refinement for JSON Schema to validate "email" and "uri" formats. Using
|
@@ -8,20 +10,26 @@ module Pupa
|
|
8
10
|
def validate(current_schema, data, fragments, processor, validator, options = {})
|
9
11
|
case current_schema.schema['format']
|
10
12
|
when 'email'
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
13
|
+
if String === data
|
14
|
+
address = Mail::Address.new(data)
|
15
|
+
unless (address.address == data && address.domain && address.__send__(:tree).domain.dot_atom_text.elements.size > 1 rescue false)
|
16
|
+
error_message = "The property '#{build_fragment(fragments)}' must be a valid email address (#{data})"
|
17
|
+
validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
|
18
|
+
end
|
19
|
+
else
|
20
|
+
error_message = "The property '#{build_fragment(fragments)}' must be a string (#{data})"
|
15
21
|
validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
|
16
|
-
return
|
17
22
|
end
|
18
23
|
when 'uri'
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
24
|
+
if String === data
|
25
|
+
re = URI::DEFAULT_PARSER.regexp[:ABS_URI]
|
26
|
+
unless re.match(data)
|
27
|
+
error_message = "The property '#{build_fragment(fragments)}' must be a valid URI (#{data})"
|
28
|
+
validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
|
29
|
+
end
|
30
|
+
else
|
31
|
+
error_message = "The property '#{build_fragment(fragments)}' must be string (#{data})"
|
23
32
|
validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
|
24
|
-
return
|
25
33
|
end
|
26
34
|
else
|
27
35
|
super
|
data/lib/pupa/runner.rb
CHANGED
@@ -73,7 +73,7 @@ module Pupa
|
|
73
73
|
opts.on('-t', '--task TASK', @processor_class.tasks, 'Select a scraping task to run (you may give this switch multiple times)', " (#{@processor_class.tasks.join(', ')})") do |v|
|
74
74
|
options.tasks << v
|
75
75
|
end
|
76
|
-
opts.on('-o', '--output_dir PATH', 'The directory or Redis address (e.g. redis://localhost:6379) in which to dump JSON documents') do |v|
|
76
|
+
opts.on('-o', '--output_dir PATH', 'The directory or Redis address (e.g. redis://localhost:6379/0) in which to dump JSON documents') do |v|
|
77
77
|
options.output_dir = v
|
78
78
|
end
|
79
79
|
opts.on('-c', '--cache_dir PATH', 'The directory or Memcached address (e.g. memcached://localhost:11211) in which to cache HTTP requests') do |v|
|
@@ -178,8 +178,8 @@ module Pupa
|
|
178
178
|
report = {
|
179
179
|
plan: {
|
180
180
|
processor: @processor_class,
|
181
|
-
|
182
|
-
|
181
|
+
options: Marshal.load(Marshal.dump(options)).to_h,
|
182
|
+
arguments: rest,
|
183
183
|
},
|
184
184
|
start: Time.now.utc,
|
185
185
|
}
|
data/lib/pupa/version.rb
CHANGED
@@ -12,6 +12,18 @@ describe Pupa::Concerns::Contactable do
|
|
12
12
|
klass.new
|
13
13
|
end
|
14
14
|
|
15
|
+
describe '#initialize' do
|
16
|
+
it 'should initialize an empty ContactDetailList' do
|
17
|
+
object.contact_details.should be_a(Pupa::ContactDetailList)
|
18
|
+
object.contact_details.should == []
|
19
|
+
end
|
20
|
+
|
21
|
+
it 'should initialize the given ContactDetailList' do
|
22
|
+
object = klass.new(contact_details: [{type: 'email', value: 'ceo@example.com', note: 'work'}])
|
23
|
+
object.contact_details.should == [{type: 'email', value: 'ceo@example.com', note: 'work'}]
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
15
27
|
describe '#contact_details=' do
|
16
28
|
it 'should use coerce to a ContactDetailList' do
|
17
29
|
object.contact_details = [{type: 'email', value: 'ceo@example.com', note: 'work'}]
|
@@ -12,6 +12,18 @@ describe Pupa::Concerns::Identifiable do
|
|
12
12
|
klass.new
|
13
13
|
end
|
14
14
|
|
15
|
+
describe '#initialize' do
|
16
|
+
it 'should initialize an empty IdentifierList' do
|
17
|
+
object.identifiers.should be_a(Pupa::IdentifierList)
|
18
|
+
object.identifiers.should == []
|
19
|
+
end
|
20
|
+
|
21
|
+
it 'should initialize the given IdentifierList' do
|
22
|
+
object = klass.new(identifiers: [{identifier: '123456789', scheme: 'DUNS'}])
|
23
|
+
object.identifiers.should == [{identifier: '123456789', scheme: 'DUNS'}]
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
15
27
|
describe '#identifiers=' do
|
16
28
|
it 'should use coerce to a IdentifierList' do
|
17
29
|
object.identifiers = [{identifier: '123456789', scheme: 'DUNS'}]
|
@@ -5,7 +5,7 @@ describe Pupa::Processor::Persistence do
|
|
5
5
|
Pupa.session = Moped::Session.new(['localhost:27017'], database: 'pupa_test')
|
6
6
|
Pupa.session.collections.each(&:drop)
|
7
7
|
|
8
|
-
Pupa::Processor::Persistence.new(Pupa::Person.new(name: 'existing')).save
|
8
|
+
Pupa::Processor::Persistence.new(Pupa::Person.new(_id: 'existing', name: 'existing')).save
|
9
9
|
|
10
10
|
Pupa.session[:people].insert(_type: 'pupa/person', name: 'non-unique')
|
11
11
|
Pupa.session[:people].insert(_type: 'pupa/person', name: 'non-unique')
|
@@ -27,11 +27,11 @@ describe Pupa::Processor::Persistence do
|
|
27
27
|
|
28
28
|
describe '#save' do
|
29
29
|
it 'should insert a document if no matches' do
|
30
|
-
Pupa::Processor::Persistence.new(Pupa::Person.new(_id: 'new', name: 'new')).save.should == 'new'
|
30
|
+
Pupa::Processor::Persistence.new(Pupa::Person.new(_id: 'new', name: 'new')).save.should == [true, 'new']
|
31
31
|
end
|
32
32
|
|
33
33
|
it 'should update a document if one match' do
|
34
|
-
Pupa::Processor::Persistence.new(Pupa::Person.new(_id: '
|
34
|
+
Pupa::Processor::Persistence.new(Pupa::Person.new(_id: 'changed', name: 'existing')).save.should == [false, 'existing']
|
35
35
|
end
|
36
36
|
|
37
37
|
it 'should raise an error if many matches' do
|
data/spec/processor_spec.rb
CHANGED
@@ -67,6 +67,10 @@ describe Pupa::Processor do
|
|
67
67
|
path = "/tmp/person_#{processor.person._id}.json"
|
68
68
|
end
|
69
69
|
|
70
|
+
it 'should return the number of scraped objects by type' do
|
71
|
+
processor.dump_scraped_objects(:people).should == {'pupa/person' => 1}
|
72
|
+
end
|
73
|
+
|
70
74
|
it 'should not overwrite an existing file' do
|
71
75
|
File.open(path, 'w') {}
|
72
76
|
expect{processor.dump_scraped_objects(:people)}.to raise_error(Pupa::Errors::DuplicateObjectIdError)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pupa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Open North
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-10-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|