RubyGems - pupa - Versions diffs - 0.0.9 → 0.0.10 - Mend

pupa 0.0.9 → 0.0.10

This diff shows the changes between two publicly released versions of the package, as they appear in the supported public registries. It is provided for informational purposes only.

Files changed (18)

checksums.yaml +4 -4
data/README.md +2 -0
data/lib/pupa/models/concerns/contactable.rb +6 -1
data/lib/pupa/models/concerns/identifiable.rb +6 -1
data/lib/pupa/models/concerns/linkable.rb +6 -1
data/lib/pupa/models/concerns/nameable.rb +6 -1
data/lib/pupa/models/concerns/sourceable.rb +6 -1
data/lib/pupa/processor/client.rb +3 -3
data/lib/pupa/processor/persistence.rb +3 -3
data/lib/pupa/processor.rb +7 -6
data/lib/pupa/refinements/json-schema.rb +18 -10
data/lib/pupa/runner.rb +3 -3
data/lib/pupa/version.rb +1 -1
data/spec/models/concerns/contactable_spec.rb +12 -0
data/spec/models/concerns/identifiable_spec.rb +12 -0
data/spec/processor/persistence_spec.rb +3 -3
data/spec/processor_spec.rb +4 -0
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 8124ac65b9df49b337205ce22fe6491ead5a03ec
-  data.tar.gz: e28195cea41f576dea3fa58def6ccfa403d2cf8c
+  metadata.gz: 1e1915f53def27a04384e080b43b7eaa1c024398
+  data.tar.gz: 75b0c4e987b976448a816b3b36be99942c393ddd
 SHA512:
-  metadata.gz: acce9361e6ec70f4daf26bbc205724b25dddb7da5437f4fcbca6b7c5fb2df2a56f22e2f6c1cdcd63fac49c5f61c9ed052336223021c40dab522de0eecdaae562
-  data.tar.gz: 606a7695c7a0d722c43e6a574397dfd1870452f972d2d2a7147e67424e750645fdf2c9b5c18edb67dce05d03df1cfa3bc5535c8cda07ae9063a8795ddf4708cb
+  metadata.gz: 5800302c7e32286138dc9f64b8dc06cd13c07cb4cfc5478d572b8fdc10c04483d595c7f48ceddb82efe4330ce280dc3b58467be50db3fa264e1429a24b3b7e0f
+  data.tar.gz: 6cf10d376b218ce4190b4b08da3dbaa1f3fddf87763623f23368eccdccc1a305d611bb3eb8a0de37c55af961eca5ef0a88f9d613bdc08b9e59139c5fc5194b30

data/README.md CHANGED Viewed

@@ -155,6 +155,8 @@ Note that Pupa.rb flushes the Redis database before scraping. If you use Redis,
 The `json-schema` gem is slow compared to, for example, [JSV](https://github.com/garycourt/JSV). Setting the `--no-validate` switch and running JSON Schema validations separately can further reduce a scraper's running time.
+The [pupa-validate](https://npmjs.org/package/pupa-validate) npm package can be used to validate JSON documents using the faster JSV. In an example case, using JSV instead of the `json-schema` gem reduced by half the time to validate 10,000 documents.
 ### Parsing JSON
 If the rest of your scraper is fast, you may see an improvement by using the `oj` gem. Just `require 'oj'` and Pupa.rb will automatically pick it up, since it uses [MultiJson](https://github.com/intridea/multi_json).

data/lib/pupa/models/concerns/contactable.rb CHANGED Viewed

@@ -9,6 +9,11 @@ module Pupa
         dump :contact_details
       end
+      def initialize(*args)
+        @contact_details = ContactDetailList.new
+        super
+      end
       # Sets the contact details.
       #
       # @param [Array] contact_details a list of contact details
@@ -27,7 +32,7 @@ module Pupa
           data[:note] = note
         end
         if type && value
-          (@contact_details ||= ContactDetailList.new) << data
+          @contact_details << data
         end
       end
     end

data/lib/pupa/models/concerns/identifiable.rb CHANGED Viewed

@@ -9,6 +9,11 @@ module Pupa
         dump :identifiers
       end
+      def initialize(*args)
+        @identifiers = IdentifierList.new
+        super
+      end
       # Sets the identifiers.
       #
       # @param [Array] identifiers a list of identifiers
@@ -26,7 +31,7 @@ module Pupa
           data[:scheme] = scheme
         end
         if identifier
-          (@identifiers ||= []) << data
+          @identifiers << data
         end
       end
     end

data/lib/pupa/models/concerns/linkable.rb CHANGED Viewed

@@ -9,6 +9,11 @@ module Pupa
         dump :links
       end
+      def initialize(*args)
+        @links = []
+        super
+      end
       # Adds a URL.
       #
       # @param [String] url a URL
@@ -19,7 +24,7 @@ module Pupa
           data[:note] = note
         end
         if url
-          (@links ||= []) << data
+          @links << data
         end
       end
     end

data/lib/pupa/models/concerns/nameable.rb CHANGED Viewed

@@ -9,6 +9,11 @@ module Pupa
         dump :other_names
       end
+      def initialize(*args)
+        @other_names = []
+        super
+      end
       # Adds an alternate or former name.
       #
       # @param [String] name an alternate or former name
@@ -27,7 +32,7 @@ module Pupa
           data[:note] = note
         end
         if name
-          (@other_names ||= []) << data
+          @other_names << data
         end
       end
     end

data/lib/pupa/models/concerns/sourceable.rb CHANGED Viewed

@@ -9,6 +9,11 @@ module Pupa
         dump :sources
       end
+      def initialize(*args)
+        @sources = []
+        super
+      end
       # Adds a source to the object.
       #
       # @param [String] url a URL
@@ -19,7 +24,7 @@ module Pupa
           data[:note] = note
         end
         if url
-          (@sources ||= []) << data
+          @sources << data
         end
       end
     end

data/lib/pupa/processor/client.rb CHANGED Viewed

@@ -36,17 +36,17 @@ module Pupa
           connection.use Middleware::RaiseError # useful for breaking concurrent requests
           # @see http://tools.ietf.org/html/rfc4627
-          connection.use Middleware::ParseJson, content_type: /\bjson$/
+          connection.use Middleware::ParseJson, preserve_raw: true, content_type: /\bjson$/
           # @see http://tools.ietf.org/html/rfc2854
           # @see http://tools.ietf.org/html/rfc3236
           if defined?(Nokogiri)
-            connection.use Middleware::ParseHtml, content_type: %w(text/html application/xhtml+xml)
+            connection.use Middleware::ParseHtml, preserve_raw: true, content_type: %w(text/html application/xhtml+xml)
           end
           # @see http://tools.ietf.org/html/rfc3023
           if defined?(MultiXml)
-            connection.use FaradayMiddleware::ParseXml, content_type: /\bxml$/
+            connection.use FaradayMiddleware::ParseXml, preserve_raw: true, content_type: /\bxml$/
           end
           if cache_dir

data/lib/pupa/processor/persistence.rb CHANGED Viewed

@@ -30,7 +30,7 @@ module Pupa
       # Saves an object to MongoDB.
       #
-      # @return [String] the object's database ID
+      # @return [Array] whether the object was inserted and the object's database ID
       # @raises [Pupa::Errors::TooManyMatches] if multiple documents would be updated
       def save
         selector = @object.fingerprint
@@ -42,11 +42,11 @@ module Pupa
           when 0
             @object.run_callbacks(:create) do
               collection.insert(@object.to_h(persist: true))
-              @object._id.to_s
+              [true, @object._id.to_s]
             end
           when 1
             query.update(@object.to_h(persist: true))
-            query.first['_id'].to_s
+            [false, query.first['_id'].to_s]
           else
             raise Errors::TooManyMatches, "selector matches multiple documents during save: #{collection_name} #{MultiJson.dump(selector)}"
           end

data/lib/pupa/processor.rb CHANGED Viewed

@@ -116,16 +116,17 @@ module Pupa
     # Dumps scraped objects to disk.
     #
     # @param [Symbol] task_name the name of the scraping task to perform
-    # @return [Integer] the number of scraped objects
+    # @return [Hash] the number of scraped objects by type
+    # @raises [Pupa::Errors::DuplicateObjectIdError]
     def dump_scraped_objects(task_name)
-      count = 0
+      counts = Hash.new(0)
       @store.pipelined do
         send(task_name).each do |object|
-          count += 1 # we don't know the size of the enumeration
+          counts[object._type] += 1
           dump_scraped_object(object)
         end
       end
-      count
+      counts
     end
     # Saves scraped objects to a database.
@@ -371,9 +372,9 @@ module Pupa
     # @param [Object] object an object
     def import_object(object)
-      id = Persistence.new(object).save
+      inserted, id = Persistence.new(object).save
       @report[:import][object._type] ||= Hash.new(0)
-      if id == object._id
+      if inserted
         @report[:import][object._type][:insert] += 1
       else
         @report[:import][object._type][:update] += 1

data/lib/pupa/refinements/json-schema.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+require 'mail'
 module Pupa
   module Refinements
     # A refinement for JSON Schema to validate "email" and "uri" formats. Using
@@ -8,20 +10,26 @@ module Pupa
       def validate(current_schema, data, fragments, processor, validator, options = {})
         case current_schema.schema['format']
         when 'email'
-          error_message = "The property '#{build_fragment(fragments)}' must be a valid email address"
-          validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors]) and return if !data.is_a?(String)
-          address = Mail::Address.new(data)
-          unless (address.address == data && address.domain && address.__send__(:tree).domain.dot_atom_text.elements.size > 1 rescue false)
+          if String === data
+            address = Mail::Address.new(data)
+            unless (address.address == data && address.domain && address.__send__(:tree).domain.dot_atom_text.elements.size > 1 rescue false)
+              error_message = "The property '#{build_fragment(fragments)}' must be a valid email address (#{data})"
+              validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
+            end
+          else
+            error_message = "The property '#{build_fragment(fragments)}' must be a string (#{data})"
             validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
-            return
           end
         when 'uri'
-          error_message = "The property '#{build_fragment(fragments)}' must be a valid URI"
-          validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors]) and return if !data.is_a?(String)
-          r = URI::DEFAULT_PARSER.regexp[:ABS_URI]
-          unless r.match(data)
+          if String === data
+            re = URI::DEFAULT_PARSER.regexp[:ABS_URI]
+            unless re.match(data)
+              error_message = "The property '#{build_fragment(fragments)}' must be a valid URI (#{data})"
+              validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
+            end
+          else
+            error_message = "The property '#{build_fragment(fragments)}' must be string (#{data})"
             validation_error(processor, error_message, fragments, current_schema, self, options[:record_errors])
-            return
           end
         else
           super

data/lib/pupa/runner.rb CHANGED Viewed

@@ -73,7 +73,7 @@ module Pupa
         opts.on('-t', '--task TASK', @processor_class.tasks, 'Select a scraping task to run (you may give this switch multiple times)', "  (#{@processor_class.tasks.join(', ')})") do |v|
           options.tasks << v
         end
-        opts.on('-o', '--output_dir PATH', 'The directory or Redis address (e.g. redis://localhost:6379) in which to dump JSON documents') do |v|
+        opts.on('-o', '--output_dir PATH', 'The directory or Redis address (e.g. redis://localhost:6379/0) in which to dump JSON documents') do |v|
           options.output_dir = v
         end
         opts.on('-c', '--cache_dir PATH', 'The directory or Memcached address (e.g. memcached://localhost:11211) in which to cache HTTP requests') do |v|
@@ -178,8 +178,8 @@ module Pupa
       report = {
         plan: {
           processor: @processor_class,
-          arguments: options.dup.to_h,
-          options: rest,
+          options: Marshal.load(Marshal.dump(options)).to_h,
+          arguments: rest,
         },
         start: Time.now.utc,
       }

data/lib/pupa/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Pupa
-  VERSION = "0.0.9"
+  VERSION = "0.0.10"
 end

data/spec/models/concerns/contactable_spec.rb CHANGED Viewed

@@ -12,6 +12,18 @@ describe Pupa::Concerns::Contactable do
     klass.new
   end
+  describe '#initialize' do
+    it 'should initialize an empty ContactDetailList' do
+      object.contact_details.should be_a(Pupa::ContactDetailList)
+      object.contact_details.should == []
+    end
+    it 'should initialize the given ContactDetailList' do
+      object = klass.new(contact_details: [{type: 'email', value: 'ceo@example.com', note: 'work'}])
+      object.contact_details.should == [{type: 'email', value: 'ceo@example.com', note: 'work'}]
+    end
+  end
   describe '#contact_details=' do
     it 'should use coerce to a ContactDetailList' do
       object.contact_details = [{type: 'email', value: 'ceo@example.com', note: 'work'}]

data/spec/models/concerns/identifiable_spec.rb CHANGED Viewed

@@ -12,6 +12,18 @@ describe Pupa::Concerns::Identifiable do
     klass.new
   end
+  describe '#initialize' do
+    it 'should initialize an empty IdentifierList' do
+      object.identifiers.should be_a(Pupa::IdentifierList)
+      object.identifiers.should == []
+    end
+    it 'should initialize the given IdentifierList' do
+      object = klass.new(identifiers: [{identifier: '123456789', scheme: 'DUNS'}])
+      object.identifiers.should == [{identifier: '123456789', scheme: 'DUNS'}]
+    end
+  end
   describe '#identifiers=' do
     it 'should use coerce to a IdentifierList' do
       object.identifiers = [{identifier: '123456789', scheme: 'DUNS'}]

data/spec/processor/persistence_spec.rb CHANGED Viewed

@@ -5,7 +5,7 @@ describe Pupa::Processor::Persistence do
     Pupa.session = Moped::Session.new(['localhost:27017'], database: 'pupa_test')
     Pupa.session.collections.each(&:drop)
-    Pupa::Processor::Persistence.new(Pupa::Person.new(name: 'existing')).save
+    Pupa::Processor::Persistence.new(Pupa::Person.new(_id: 'existing', name: 'existing')).save
     Pupa.session[:people].insert(_type: 'pupa/person', name: 'non-unique')
     Pupa.session[:people].insert(_type: 'pupa/person', name: 'non-unique')
@@ -27,11 +27,11 @@ describe Pupa::Processor::Persistence do
   describe '#save' do
     it 'should insert a document if no matches' do
-      Pupa::Processor::Persistence.new(Pupa::Person.new(_id: 'new', name: 'new')).save.should == 'new'
+      Pupa::Processor::Persistence.new(Pupa::Person.new(_id: 'new', name: 'new')).save.should == [true, 'new']
     end
     it 'should update a document if one match' do
-      Pupa::Processor::Persistence.new(Pupa::Person.new(_id: 'existing', name: 'existing')).save.should_not == 'existing'
+      Pupa::Processor::Persistence.new(Pupa::Person.new(_id: 'changed', name: 'existing')).save.should == [false, 'existing']
     end
     it 'should raise an error if many matches' do

data/spec/processor_spec.rb CHANGED Viewed

@@ -67,6 +67,10 @@ describe Pupa::Processor do
       path = "/tmp/person_#{processor.person._id}.json"
     end
+    it 'should return the number of scraped objects by type' do
+      processor.dump_scraped_objects(:people).should == {'pupa/person' => 1}
+    end
     it 'should not overwrite an existing file' do
       File.open(path, 'w') {}
       expect{processor.dump_scraped_objects(:people)}.to raise_error(Pupa::Errors::DuplicateObjectIdError)

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pupa
 version: !ruby/object:Gem::Version
-  version: 0.0.9
+  version: 0.0.10
 platform: ruby
 authors:
 - Open North
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-09-30 00:00:00.000000000 Z
+date: 2013-10-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activesupport