pupa 0.0.9 → 0.0.10

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8124ac65b9df49b337205ce22fe6491ead5a03ec
4
- data.tar.gz: e28195cea41f576dea3fa58def6ccfa403d2cf8c
3
+ metadata.gz: 1e1915f53def27a04384e080b43b7eaa1c024398
4
+ data.tar.gz: 75b0c4e987b976448a816b3b36be99942c393ddd
5
5
  SHA512:
6
- metadata.gz: acce9361e6ec70f4daf26bbc205724b25dddb7da5437f4fcbca6b7c5fb2df2a56f22e2f6c1cdcd63fac49c5f61c9ed052336223021c40dab522de0eecdaae562
7
- data.tar.gz: 606a7695c7a0d722c43e6a574397dfd1870452f972d2d2a7147e67424e750645fdf2c9b5c18edb67dce05d03df1cfa3bc5535c8cda07ae9063a8795ddf4708cb
6
+ metadata.gz: 5800302c7e32286138dc9f64b8dc06cd13c07cb4cfc5478d572b8fdc10c04483d595c7f48ceddb82efe4330ce280dc3b58467be50db3fa264e1429a24b3b7e0f
7
+ data.tar.gz: 6cf10d376b218ce4190b4b08da3dbaa1f3fddf87763623f23368eccdccc1a305d611bb3eb8a0de37c55af961eca5ef0a88f9d613bdc08b9e59139c5fc5194b30
data/README.md CHANGED
@@ -155,6 +155,8 @@ Note that Pupa.rb flushes the Redis database before scraping. If you use Redis,
155
155
 
156
156
  The `json-schema` gem is slow compared to, for example, [JSV](https://github.com/garycourt/JSV). Setting the `--no-validate` switch and running JSON Schema validations separately can further reduce a scraper's running time.
157
157
 
158
+ The [pupa-validate](https://npmjs.org/package/pupa-validate) npm package can be used to validate JSON documents using the faster JSV. In one example, using JSV instead of the `json-schema` gem halved the time needed to validate 10,000 documents.
159
+
158
160
  ### Parsing JSON
159
161
 
160
162
  If the rest of your scraper is fast, you may see an improvement by using the `oj` gem. Just `require 'oj'` and Pupa.rb will automatically pick it up, since it uses [MultiJson](https://github.com/intridea/multi_json).
@@ -9,6 +9,11 @@ module Pupa
9
9
  dump :contact_details
10
10
  end
11
11
 
12
+ def initialize(*args)
13
+ @contact_details = ContactDetailList.new
14
+ super
15
+ end
16
+
12
17
  # Sets the contact details.
13
18
  #
14
19
  # @param [Array] contact_details a list of contact details
@@ -27,7 +32,7 @@ module Pupa
27
32
  data[:note] = note
28
33
  end
29
34
  if type && value
30
- (@contact_details ||= ContactDetailList.new) << data
35
+ @contact_details << data
31
36
  end
32
37
  end
33
38
  end
@@ -9,6 +9,11 @@ module Pupa
9
9
  dump :identifiers
10
10
  end
11
11
 
12
+ def initialize(*args)
13
+ @identifiers = IdentifierList.new
14
+ super
15
+ end
16
+
12
17
  # Sets the identifiers.
13
18
  #
14
19
  # @param [Array] identifiers a list of identifiers
@@ -26,7 +31,7 @@ module Pupa
26
31
  data[:scheme] = scheme
27
32
  end
28
33
  if identifier
29
- (@identifiers ||= []) << data
34
+ @identifiers << data
30
35
  end
31
36
  end
32
37
  end
@@ -9,6 +9,11 @@ module Pupa
9
9
  dump :links
10
10
  end
11
11
 
12
+ def initialize(*args)
13
+ @links = []
14
+ super
15
+ end
16
+
12
17
  # Adds a URL.
13
18
  #
14
19
  # @param [String] url a URL
@@ -19,7 +24,7 @@ module Pupa
19
24
  data[:note] = note
20
25
  end
21
26
  if url
22
- (@links ||= []) << data
27
+ @links << data
23
28
  end
24
29
  end
25
30
  end
@@ -9,6 +9,11 @@ module Pupa
9
9
  dump :other_names
10
10
  end
11
11
 
12
+ def initialize(*args)
13
+ @other_names = []
14
+ super
15
+ end
16
+
12
17
  # Adds an alternate or former name.
13
18
  #
14
19
  # @param [String] name an alternate or former name
@@ -27,7 +32,7 @@ module Pupa
27
32
  data[:note] = note
28
33
  end
29
34
  if name
30
- (@other_names ||= []) << data
35
+ @other_names << data
31
36
  end
32
37
  end
33
38
  end
@@ -9,6 +9,11 @@ module Pupa
9
9
  dump :sources
10
10
  end
11
11
 
12
+ def initialize(*args)
13
+ @sources = []
14
+ super
15
+ end
16
+
12
17
  # Adds a source to the object.
13
18
  #
14
19
  # @param [String] url a URL
@@ -19,7 +24,7 @@ module Pupa
19
24
  data[:note] = note
20
25
  end
21
26
  if url
22
- (@sources ||= []) << data
27
+ @sources << data
23
28
  end
24
29
  end
25
30
  end
@@ -36,17 +36,17 @@ module Pupa
36
36
  connection.use Middleware::RaiseError # useful for breaking concurrent requests
37
37
 
38
38
  # @see http://tools.ietf.org/html/rfc4627
39
- connection.use Middleware::ParseJson, content_type: /\bjson$/
39
+ connection.use Middleware::ParseJson, preserve_raw: true, content_type: /\bjson$/
40
40
 
41
41
  # @see http://tools.ietf.org/html/rfc2854
42
42
  # @see http://tools.ietf.org/html/rfc3236
43
43
  if defined?(Nokogiri)
44
- connection.use Middleware::ParseHtml, content_type: %w(text/html application/xhtml+xml)
44
+ connection.use Middleware::ParseHtml, preserve_raw: true, content_type: %w(text/html application/xhtml+xml)
45
45
  end
46
46
 
47
47
  # @see http://tools.ietf.org/html/rfc3023
48
48
  if defined?(MultiXml)
49
- connection.use FaradayMiddleware::ParseXml, content_type: /\bxml$/
49
+ connection.use FaradayMiddleware::ParseXml, preserve_raw: true, content_type: /\bxml$/
50
50
  end
51
51
 
52
52
  if cache_dir
@@ -30,7 +30,7 @@ module Pupa
30
30
 
31
31
  # Saves an object to MongoDB.
32
32
  #
33
- # @return [String] the object's database ID
33
+ # @return [Array] whether the object was inserted and the object's database ID
34
34
  # @raise [Pupa::Errors::TooManyMatches] if multiple documents would be updated
35
35
  def save
36
36
  selector = @object.fingerprint
@@ -42,11 +42,11 @@ module Pupa
42
42
  when 0
43
43
  @object.run_callbacks(:create) do
44
44
  collection.insert(@object.to_h(persist: true))
45
- @object._id.to_s
45
+ [true, @object._id.to_s]
46
46
  end
47
47
  when 1
48
48
  query.update(@object.to_h(persist: true))
49
- query.first['_id'].to_s
49
+ [false, query.first['_id'].to_s]
50
50
  else
51
51
  raise Errors::TooManyMatches, "selector matches multiple documents during save: #{collection_name} #{MultiJson.dump(selector)}"
52
52
  end
@@ -116,16 +116,17 @@ module Pupa
116
116
  # Dumps scraped objects to disk.
117
117
  #
118
118
  # @param [Symbol] task_name the name of the scraping task to perform
119
- # @return [Integer] the number of scraped objects
119
+ # @return [Hash] the number of scraped objects by type
120
+ # @raise [Pupa::Errors::DuplicateObjectIdError]
120
121
  def dump_scraped_objects(task_name)
121
- count = 0
122
+ counts = Hash.new(0)
122
123
  @store.pipelined do
123
124
  send(task_name).each do |object|
124
- count += 1 # we don't know the size of the enumeration
125
+ counts[object._type] += 1
125
126
  dump_scraped_object(object)
126
127
  end
127
128
  end
128
- count
129
+ counts
129
130
  end
130
131
 
131
132
  # Saves scraped objects to a database.
@@ -371,9 +372,9 @@ module Pupa
371
372
 
372
373
  # @param [Object] object an object
373
374
  def import_object(object)
374
- id = Persistence.new(object).save
375
+ inserted, id = Persistence.new(object).save
375
376
  @report[:import][object._type] ||= Hash.new(0)
376
- if id == object._id
377
+ if inserted
377
378
  @report[:import][object._type][:insert] += 1
378
379
  else
379
380
  @report[:import][object._type][:update] += 1
@@ -1,3 +1,5 @@
1
+ require 'mail'
2
+
1
3
  module Pupa
2
4
  module Refinements
3
5
  # A refinement for JSON Schema to validate "email" and "uri" formats. Using
@@ -8,20 +10,26 @@ module Pupa
8
10
  def validate(current_schema, data, fragments, processor, validator, options = {})
9
11
  case current_schema.schema['format']
10
12
  when 'email'
11
- error_message = "The property '#{build_fragment(fragments)}' must be a valid email address"
12
- validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors]) and return if !data.is_a?(String)
13
- address = Mail::Address.new(data)
14
- unless (address.address == data && address.domain && address.__send__(:tree).domain.dot_atom_text.elements.size > 1 rescue false)
13
+ if String === data
14
+ address = Mail::Address.new(data)
15
+ unless (address.address == data && address.domain && address.__send__(:tree).domain.dot_atom_text.elements.size > 1 rescue false)
16
+ error_message = "The property '#{build_fragment(fragments)}' must be a valid email address (#{data})"
17
+ validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
18
+ end
19
+ else
20
+ error_message = "The property '#{build_fragment(fragments)}' must be a string (#{data})"
15
21
  validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
16
- return
17
22
  end
18
23
  when 'uri'
19
- error_message = "The property '#{build_fragment(fragments)}' must be a valid URI"
20
- validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors]) and return if !data.is_a?(String)
21
- r = URI::DEFAULT_PARSER.regexp[:ABS_URI]
22
- unless r.match(data)
24
+ if String === data
25
+ re = URI::DEFAULT_PARSER.regexp[:ABS_URI]
26
+ unless re.match(data)
27
+ error_message = "The property '#{build_fragment(fragments)}' must be a valid URI (#{data})"
28
+ validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
29
+ end
30
+ else
31
+ error_message = "The property '#{build_fragment(fragments)}' must be string (#{data})"
23
32
  validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
24
- return
25
33
  end
26
34
  else
27
35
  super
data/lib/pupa/runner.rb CHANGED
@@ -73,7 +73,7 @@ module Pupa
73
73
  opts.on('-t', '--task TASK', @processor_class.tasks, 'Select a scraping task to run (you may give this switch multiple times)', " (#{@processor_class.tasks.join(', ')})") do |v|
74
74
  options.tasks << v
75
75
  end
76
- opts.on('-o', '--output_dir PATH', 'The directory or Redis address (e.g. redis://localhost:6379) in which to dump JSON documents') do |v|
76
+ opts.on('-o', '--output_dir PATH', 'The directory or Redis address (e.g. redis://localhost:6379/0) in which to dump JSON documents') do |v|
77
77
  options.output_dir = v
78
78
  end
79
79
  opts.on('-c', '--cache_dir PATH', 'The directory or Memcached address (e.g. memcached://localhost:11211) in which to cache HTTP requests') do |v|
@@ -178,8 +178,8 @@ module Pupa
178
178
  report = {
179
179
  plan: {
180
180
  processor: @processor_class,
181
- arguments: options.dup.to_h,
182
- options: rest,
181
+ options: Marshal.load(Marshal.dump(options)).to_h,
182
+ arguments: rest,
183
183
  },
184
184
  start: Time.now.utc,
185
185
  }
data/lib/pupa/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Pupa
2
- VERSION = "0.0.9"
2
+ VERSION = "0.0.10"
3
3
  end
@@ -12,6 +12,18 @@ describe Pupa::Concerns::Contactable do
12
12
  klass.new
13
13
  end
14
14
 
15
+ describe '#initialize' do
16
+ it 'should initialize an empty ContactDetailList' do
17
+ object.contact_details.should be_a(Pupa::ContactDetailList)
18
+ object.contact_details.should == []
19
+ end
20
+
21
+ it 'should initialize the given ContactDetailList' do
22
+ object = klass.new(contact_details: [{type: 'email', value: 'ceo@example.com', note: 'work'}])
23
+ object.contact_details.should == [{type: 'email', value: 'ceo@example.com', note: 'work'}]
24
+ end
25
+ end
26
+
15
27
  describe '#contact_details=' do
16
28
  it 'should use coerce to a ContactDetailList' do
17
29
  object.contact_details = [{type: 'email', value: 'ceo@example.com', note: 'work'}]
@@ -12,6 +12,18 @@ describe Pupa::Concerns::Identifiable do
12
12
  klass.new
13
13
  end
14
14
 
15
+ describe '#initialize' do
16
+ it 'should initialize an empty IdentifierList' do
17
+ object.identifiers.should be_a(Pupa::IdentifierList)
18
+ object.identifiers.should == []
19
+ end
20
+
21
+ it 'should initialize the given IdentifierList' do
22
+ object = klass.new(identifiers: [{identifier: '123456789', scheme: 'DUNS'}])
23
+ object.identifiers.should == [{identifier: '123456789', scheme: 'DUNS'}]
24
+ end
25
+ end
26
+
15
27
  describe '#identifiers=' do
16
28
  it 'should use coerce to a IdentifierList' do
17
29
  object.identifiers = [{identifier: '123456789', scheme: 'DUNS'}]
@@ -5,7 +5,7 @@ describe Pupa::Processor::Persistence do
5
5
  Pupa.session = Moped::Session.new(['localhost:27017'], database: 'pupa_test')
6
6
  Pupa.session.collections.each(&:drop)
7
7
 
8
- Pupa::Processor::Persistence.new(Pupa::Person.new(name: 'existing')).save
8
+ Pupa::Processor::Persistence.new(Pupa::Person.new(_id: 'existing', name: 'existing')).save
9
9
 
10
10
  Pupa.session[:people].insert(_type: 'pupa/person', name: 'non-unique')
11
11
  Pupa.session[:people].insert(_type: 'pupa/person', name: 'non-unique')
@@ -27,11 +27,11 @@ describe Pupa::Processor::Persistence do
27
27
 
28
28
  describe '#save' do
29
29
  it 'should insert a document if no matches' do
30
- Pupa::Processor::Persistence.new(Pupa::Person.new(_id: 'new', name: 'new')).save.should == 'new'
30
+ Pupa::Processor::Persistence.new(Pupa::Person.new(_id: 'new', name: 'new')).save.should == [true, 'new']
31
31
  end
32
32
 
33
33
  it 'should update a document if one match' do
34
- Pupa::Processor::Persistence.new(Pupa::Person.new(_id: 'existing', name: 'existing')).save.should_not == 'existing'
34
+ Pupa::Processor::Persistence.new(Pupa::Person.new(_id: 'changed', name: 'existing')).save.should == [false, 'existing']
35
35
  end
36
36
 
37
37
  it 'should raise an error if many matches' do
@@ -67,6 +67,10 @@ describe Pupa::Processor do
67
67
  path = "/tmp/person_#{processor.person._id}.json"
68
68
  end
69
69
 
70
+ it 'should return the number of scraped objects by type' do
71
+ processor.dump_scraped_objects(:people).should == {'pupa/person' => 1}
72
+ end
73
+
70
74
  it 'should not overwrite an existing file' do
71
75
  File.open(path, 'w') {}
72
76
  expect{processor.dump_scraped_objects(:people)}.to raise_error(Pupa::Errors::DuplicateObjectIdError)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pupa
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.0.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Open North
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-09-30 00:00:00.000000000 Z
11
+ date: 2013-10-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport