pupa 0.0.9 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8124ac65b9df49b337205ce22fe6491ead5a03ec
4
- data.tar.gz: e28195cea41f576dea3fa58def6ccfa403d2cf8c
3
+ metadata.gz: 1e1915f53def27a04384e080b43b7eaa1c024398
4
+ data.tar.gz: 75b0c4e987b976448a816b3b36be99942c393ddd
5
5
  SHA512:
6
- metadata.gz: acce9361e6ec70f4daf26bbc205724b25dddb7da5437f4fcbca6b7c5fb2df2a56f22e2f6c1cdcd63fac49c5f61c9ed052336223021c40dab522de0eecdaae562
7
- data.tar.gz: 606a7695c7a0d722c43e6a574397dfd1870452f972d2d2a7147e67424e750645fdf2c9b5c18edb67dce05d03df1cfa3bc5535c8cda07ae9063a8795ddf4708cb
6
+ metadata.gz: 5800302c7e32286138dc9f64b8dc06cd13c07cb4cfc5478d572b8fdc10c04483d595c7f48ceddb82efe4330ce280dc3b58467be50db3fa264e1429a24b3b7e0f
7
+ data.tar.gz: 6cf10d376b218ce4190b4b08da3dbaa1f3fddf87763623f23368eccdccc1a305d611bb3eb8a0de37c55af961eca5ef0a88f9d613bdc08b9e59139c5fc5194b30
data/README.md CHANGED
@@ -155,6 +155,8 @@ Note that Pupa.rb flushes the Redis database before scraping. If you use Redis,
155
155
 
156
156
  The `json-schema` gem is slow compared to, for example, [JSV](https://github.com/garycourt/JSV). Setting the `--no-validate` switch and running JSON Schema validations separately can further reduce a scraper's running time.
157
157
 
158
+ The [pupa-validate](https://npmjs.org/package/pupa-validate) npm package can be used to validate JSON documents using the faster JSV. In one example, using JSV instead of the `json-schema` gem halved the time needed to validate 10,000 documents.
159
+
158
160
  ### Parsing JSON
159
161
 
160
162
  If the rest of your scraper is fast, you may see an improvement by using the `oj` gem. Just `require 'oj'` and Pupa.rb will automatically pick it up, since it uses [MultiJson](https://github.com/intridea/multi_json).
@@ -9,6 +9,11 @@ module Pupa
9
9
  dump :contact_details
10
10
  end
11
11
 
12
+ def initialize(*args)
13
+ @contact_details = ContactDetailList.new
14
+ super
15
+ end
16
+
12
17
  # Sets the contact details.
13
18
  #
14
19
  # @param [Array] contact_details a list of contact details
@@ -27,7 +32,7 @@ module Pupa
27
32
  data[:note] = note
28
33
  end
29
34
  if type && value
30
- (@contact_details ||= ContactDetailList.new) << data
35
+ @contact_details << data
31
36
  end
32
37
  end
33
38
  end
@@ -9,6 +9,11 @@ module Pupa
9
9
  dump :identifiers
10
10
  end
11
11
 
12
+ def initialize(*args)
13
+ @identifiers = IdentifierList.new
14
+ super
15
+ end
16
+
12
17
  # Sets the identifiers.
13
18
  #
14
19
  # @param [Array] identifiers a list of identifiers
@@ -26,7 +31,7 @@ module Pupa
26
31
  data[:scheme] = scheme
27
32
  end
28
33
  if identifier
29
- (@identifiers ||= []) << data
34
+ @identifiers << data
30
35
  end
31
36
  end
32
37
  end
@@ -9,6 +9,11 @@ module Pupa
9
9
  dump :links
10
10
  end
11
11
 
12
+ def initialize(*args)
13
+ @links = []
14
+ super
15
+ end
16
+
12
17
  # Adds a URL.
13
18
  #
14
19
  # @param [String] url a URL
@@ -19,7 +24,7 @@ module Pupa
19
24
  data[:note] = note
20
25
  end
21
26
  if url
22
- (@links ||= []) << data
27
+ @links << data
23
28
  end
24
29
  end
25
30
  end
@@ -9,6 +9,11 @@ module Pupa
9
9
  dump :other_names
10
10
  end
11
11
 
12
+ def initialize(*args)
13
+ @other_names = []
14
+ super
15
+ end
16
+
12
17
  # Adds an alternate or former name.
13
18
  #
14
19
  # @param [String] name an alternate or former name
@@ -27,7 +32,7 @@ module Pupa
27
32
  data[:note] = note
28
33
  end
29
34
  if name
30
- (@other_names ||= []) << data
35
+ @other_names << data
31
36
  end
32
37
  end
33
38
  end
@@ -9,6 +9,11 @@ module Pupa
9
9
  dump :sources
10
10
  end
11
11
 
12
+ def initialize(*args)
13
+ @sources = []
14
+ super
15
+ end
16
+
12
17
  # Adds a source to the object.
13
18
  #
14
19
  # @param [String] url a URL
@@ -19,7 +24,7 @@ module Pupa
19
24
  data[:note] = note
20
25
  end
21
26
  if url
22
- (@sources ||= []) << data
27
+ @sources << data
23
28
  end
24
29
  end
25
30
  end
@@ -36,17 +36,17 @@ module Pupa
36
36
  connection.use Middleware::RaiseError # useful for breaking concurrent requests
37
37
 
38
38
  # @see http://tools.ietf.org/html/rfc4627
39
- connection.use Middleware::ParseJson, content_type: /\bjson$/
39
+ connection.use Middleware::ParseJson, preserve_raw: true, content_type: /\bjson$/
40
40
 
41
41
  # @see http://tools.ietf.org/html/rfc2854
42
42
  # @see http://tools.ietf.org/html/rfc3236
43
43
  if defined?(Nokogiri)
44
- connection.use Middleware::ParseHtml, content_type: %w(text/html application/xhtml+xml)
44
+ connection.use Middleware::ParseHtml, preserve_raw: true, content_type: %w(text/html application/xhtml+xml)
45
45
  end
46
46
 
47
47
  # @see http://tools.ietf.org/html/rfc3023
48
48
  if defined?(MultiXml)
49
- connection.use FaradayMiddleware::ParseXml, content_type: /\bxml$/
49
+ connection.use FaradayMiddleware::ParseXml, preserve_raw: true, content_type: /\bxml$/
50
50
  end
51
51
 
52
52
  if cache_dir
@@ -30,7 +30,7 @@ module Pupa
30
30
 
31
31
  # Saves an object to MongoDB.
32
32
  #
33
- # @return [String] the object's database ID
33
+ # @return [Array] whether the object was inserted and the object's database ID
34
34
  # @raise [Pupa::Errors::TooManyMatches] if multiple documents would be updated
35
35
  def save
36
36
  selector = @object.fingerprint
@@ -42,11 +42,11 @@ module Pupa
42
42
  when 0
43
43
  @object.run_callbacks(:create) do
44
44
  collection.insert(@object.to_h(persist: true))
45
- @object._id.to_s
45
+ [true, @object._id.to_s]
46
46
  end
47
47
  when 1
48
48
  query.update(@object.to_h(persist: true))
49
- query.first['_id'].to_s
49
+ [false, query.first['_id'].to_s]
50
50
  else
51
51
  raise Errors::TooManyMatches, "selector matches multiple documents during save: #{collection_name} #{MultiJson.dump(selector)}"
52
52
  end
@@ -116,16 +116,17 @@ module Pupa
116
116
  # Dumps scraped objects to disk.
117
117
  #
118
118
  # @param [Symbol] task_name the name of the scraping task to perform
119
- # @return [Integer] the number of scraped objects
119
+ # @return [Hash] the number of scraped objects by type
120
+ # @raise [Pupa::Errors::DuplicateObjectIdError]
120
121
  def dump_scraped_objects(task_name)
121
- count = 0
122
+ counts = Hash.new(0)
122
123
  @store.pipelined do
123
124
  send(task_name).each do |object|
124
- count += 1 # we don't know the size of the enumeration
125
+ counts[object._type] += 1
125
126
  dump_scraped_object(object)
126
127
  end
127
128
  end
128
- count
129
+ counts
129
130
  end
130
131
 
131
132
  # Saves scraped objects to a database.
@@ -371,9 +372,9 @@ module Pupa
371
372
 
372
373
  # @param [Object] object an object
373
374
  def import_object(object)
374
- id = Persistence.new(object).save
375
+ inserted, id = Persistence.new(object).save
375
376
  @report[:import][object._type] ||= Hash.new(0)
376
- if id == object._id
377
+ if inserted
377
378
  @report[:import][object._type][:insert] += 1
378
379
  else
379
380
  @report[:import][object._type][:update] += 1
@@ -1,3 +1,5 @@
1
+ require 'mail'
2
+
1
3
  module Pupa
2
4
  module Refinements
3
5
  # A refinement for JSON Schema to validate "email" and "uri" formats. Using
@@ -8,20 +10,26 @@ module Pupa
8
10
  def validate(current_schema, data, fragments, processor, validator, options = {})
9
11
  case current_schema.schema['format']
10
12
  when 'email'
11
- error_message = "The property '#{build_fragment(fragments)}' must be a valid email address"
12
- validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors]) and return if !data.is_a?(String)
13
- address = Mail::Address.new(data)
14
- unless (address.address == data && address.domain && address.__send__(:tree).domain.dot_atom_text.elements.size > 1 rescue false)
13
+ if String === data
14
+ address = Mail::Address.new(data)
15
+ unless (address.address == data && address.domain && address.__send__(:tree).domain.dot_atom_text.elements.size > 1 rescue false)
16
+ error_message = "The property '#{build_fragment(fragments)}' must be a valid email address (#{data})"
17
+ validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
18
+ end
19
+ else
20
+ error_message = "The property '#{build_fragment(fragments)}' must be a string (#{data})"
15
21
  validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
16
- return
17
22
  end
18
23
  when 'uri'
19
- error_message = "The property '#{build_fragment(fragments)}' must be a valid URI"
20
- validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors]) and return if !data.is_a?(String)
21
- r = URI::DEFAULT_PARSER.regexp[:ABS_URI]
22
- unless r.match(data)
24
+ if String === data
25
+ re = URI::DEFAULT_PARSER.regexp[:ABS_URI]
26
+ unless re.match(data)
27
+ error_message = "The property '#{build_fragment(fragments)}' must be a valid URI (#{data})"
28
+ validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
29
+ end
30
+ else
31
+ error_message = "The property '#{build_fragment(fragments)}' must be string (#{data})"
23
32
  validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
24
- return
25
33
  end
26
34
  else
27
35
  super
data/lib/pupa/runner.rb CHANGED
@@ -73,7 +73,7 @@ module Pupa
73
73
  opts.on('-t', '--task TASK', @processor_class.tasks, 'Select a scraping task to run (you may give this switch multiple times)', " (#{@processor_class.tasks.join(', ')})") do |v|
74
74
  options.tasks << v
75
75
  end
76
- opts.on('-o', '--output_dir PATH', 'The directory or Redis address (e.g. redis://localhost:6379) in which to dump JSON documents') do |v|
76
+ opts.on('-o', '--output_dir PATH', 'The directory or Redis address (e.g. redis://localhost:6379/0) in which to dump JSON documents') do |v|
77
77
  options.output_dir = v
78
78
  end
79
79
  opts.on('-c', '--cache_dir PATH', 'The directory or Memcached address (e.g. memcached://localhost:11211) in which to cache HTTP requests') do |v|
@@ -178,8 +178,8 @@ module Pupa
178
178
  report = {
179
179
  plan: {
180
180
  processor: @processor_class,
181
- arguments: options.dup.to_h,
182
- options: rest,
181
+ options: Marshal.load(Marshal.dump(options)).to_h,
182
+ arguments: rest,
183
183
  },
184
184
  start: Time.now.utc,
185
185
  }
data/lib/pupa/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Pupa
2
- VERSION = "0.0.9"
2
+ VERSION = "0.0.10"
3
3
  end
@@ -12,6 +12,18 @@ describe Pupa::Concerns::Contactable do
12
12
  klass.new
13
13
  end
14
14
 
15
+ describe '#initialize' do
16
+ it 'should initialize an empty ContactDetailList' do
17
+ object.contact_details.should be_a(Pupa::ContactDetailList)
18
+ object.contact_details.should == []
19
+ end
20
+
21
+ it 'should initialize the given ContactDetailList' do
22
+ object = klass.new(contact_details: [{type: 'email', value: 'ceo@example.com', note: 'work'}])
23
+ object.contact_details.should == [{type: 'email', value: 'ceo@example.com', note: 'work'}]
24
+ end
25
+ end
26
+
15
27
  describe '#contact_details=' do
16
28
  it 'should use coerce to a ContactDetailList' do
17
29
  object.contact_details = [{type: 'email', value: 'ceo@example.com', note: 'work'}]
@@ -12,6 +12,18 @@ describe Pupa::Concerns::Identifiable do
12
12
  klass.new
13
13
  end
14
14
 
15
+ describe '#initialize' do
16
+ it 'should initialize an empty IdentifierList' do
17
+ object.identifiers.should be_a(Pupa::IdentifierList)
18
+ object.identifiers.should == []
19
+ end
20
+
21
+ it 'should initialize the given IdentifierList' do
22
+ object = klass.new(identifiers: [{identifier: '123456789', scheme: 'DUNS'}])
23
+ object.identifiers.should == [{identifier: '123456789', scheme: 'DUNS'}]
24
+ end
25
+ end
26
+
15
27
  describe '#identifiers=' do
16
28
  it 'should use coerce to a IdentifierList' do
17
29
  object.identifiers = [{identifier: '123456789', scheme: 'DUNS'}]
@@ -5,7 +5,7 @@ describe Pupa::Processor::Persistence do
5
5
  Pupa.session = Moped::Session.new(['localhost:27017'], database: 'pupa_test')
6
6
  Pupa.session.collections.each(&:drop)
7
7
 
8
- Pupa::Processor::Persistence.new(Pupa::Person.new(name: 'existing')).save
8
+ Pupa::Processor::Persistence.new(Pupa::Person.new(_id: 'existing', name: 'existing')).save
9
9
 
10
10
  Pupa.session[:people].insert(_type: 'pupa/person', name: 'non-unique')
11
11
  Pupa.session[:people].insert(_type: 'pupa/person', name: 'non-unique')
@@ -27,11 +27,11 @@ describe Pupa::Processor::Persistence do
27
27
 
28
28
  describe '#save' do
29
29
  it 'should insert a document if no matches' do
30
- Pupa::Processor::Persistence.new(Pupa::Person.new(_id: 'new', name: 'new')).save.should == 'new'
30
+ Pupa::Processor::Persistence.new(Pupa::Person.new(_id: 'new', name: 'new')).save.should == [true, 'new']
31
31
  end
32
32
 
33
33
  it 'should update a document if one match' do
34
- Pupa::Processor::Persistence.new(Pupa::Person.new(_id: 'existing', name: 'existing')).save.should_not == 'existing'
34
+ Pupa::Processor::Persistence.new(Pupa::Person.new(_id: 'changed', name: 'existing')).save.should == [false, 'existing']
35
35
  end
36
36
 
37
37
  it 'should raise an error if many matches' do
@@ -67,6 +67,10 @@ describe Pupa::Processor do
67
67
  path = "/tmp/person_#{processor.person._id}.json"
68
68
  end
69
69
 
70
+ it 'should return the number of scraped objects by type' do
71
+ processor.dump_scraped_objects(:people).should == {'pupa/person' => 1}
72
+ end
73
+
70
74
  it 'should not overwrite an existing file' do
71
75
  File.open(path, 'w') {}
72
76
  expect{processor.dump_scraped_objects(:people)}.to raise_error(Pupa::Errors::DuplicateObjectIdError)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pupa
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.0.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Open North
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-09-30 00:00:00.000000000 Z
11
+ date: 2013-10-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport