scruber-mongo 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -1
- data/Gemfile +1 -1
- data/Gemfile.lock +19 -13
- data/README.md +26 -14
- data/lib/scruber/core/extensions/mongo_output.rb +89 -3
- data/lib/scruber/mongo/cli/generators.rb +1 -0
- data/lib/scruber/mongo/cli/templates/mongo_initializer.tt +3 -0
- data/lib/scruber/mongo/version.rb +1 -1
- data/lib/scruber/queue_adapters/mongo.rb +126 -21
- data/scruber-mongo.gemspec +2 -2
- metadata +11 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5c298bffe030c11d719251f76cd10f7e12c18c39
+  data.tar.gz: 458c6f63660ef791c504480cd4b6b640b5cca9f9
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: acfc91d30bf3f9fb48b344a3b9ad904c9f46eff2411675538fa46ab7de544ea6864ff20dec23b73afcaffb255e4f6c458925b4f36aaa9553c7f079aea2dc4da8
+  data.tar.gz: d4e10f9ffd106090beef149dd346785cc40ec6b8e3f2133a53a773f586f1f8d91b34c54ee373acdf9fd34dbff611d5e8e385eaf14cccf0ac489898b47ebd649f
data/.gitignore
CHANGED
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,16 +1,16 @@
 PATH
   remote: .
   specs:
-    scruber-mongo (0.1.0)
+    scruber-mongo (0.1.1)
       mongo (~> 2.4)
-      scruber (~> 0.1.
+      scruber (~> 0.1.6)

 GEM
   remote: https://rubygems.org/
   specs:
-    activesupport (5.
+    activesupport (5.2.0)
       concurrent-ruby (~> 1.0, >= 1.0.2)
-      i18n (
+      i18n (>= 0.7, < 2)
       minitest (~> 5.1)
       tzinfo (~> 1.1)
     addressable (2.5.2)
@@ -21,15 +21,16 @@ GEM
       safe_yaml (~> 1.0.0)
     database_cleaner (1.6.2)
     diff-lcs (1.3)
-    domain_name (0.5.
+    domain_name (0.5.20180417)
       unf (>= 0.0.5, < 1.0.0)
     ethon (0.11.0)
       ffi (>= 1.3.0)
     ffi (1.9.23)
     hashdiff (0.3.7)
+    hashie (3.5.7)
     http-cookie (1.0.3)
       domain_name (~> 0.5)
-    i18n (0.
+    i18n (1.0.1)
       concurrent-ruby (~> 1.0)
     mini_portile2 (2.3.0)
     minitest (5.11.3)
@@ -37,7 +38,10 @@ GEM
       bson (>= 4.3.0, < 5.0.0)
     nokogiri (1.8.2)
       mini_portile2 (~> 2.3.0)
+    paint (2.0.1)
     pickup (0.0.11)
+    powerbar (2.0.1)
+      hashie (>= 1.1.0)
     public_suffix (3.0.2)
     rake (10.5.0)
     rspec (3.7.0)
@@ -54,16 +58,18 @@ GEM
       rspec-support (~> 3.7.0)
     rspec-support (3.7.1)
     safe_yaml (1.0.4)
-    scruber (0.1.
-      activesupport (
+    scruber (0.1.6)
+      activesupport (~> 5.1, >= 5.1.5)
       http-cookie (= 1.0.3)
-      nokogiri (
-
+      nokogiri (~> 1.8, >= 1.8.2)
+      paint (~> 2.0, >= 2.0.1)
+      pickup (~> 0.0.11)
+      powerbar (~> 2.0, >= 2.0.1)
       thor (= 0.20.0)
-      typhoeus (
+      typhoeus (~> 1.1, >= 1.1.2)
     thor (0.20.0)
     thread_safe (0.3.6)
-    typhoeus (1.
+    typhoeus (1.3.0)
       ethon (>= 0.9.0)
     tzinfo (1.2.5)
       thread_safe (~> 0.1)
@@ -80,7 +86,7 @@ PLATFORMS

 DEPENDENCIES
   bundler (~> 1.16)
-  database_cleaner (~> 1.6.0)
+  database_cleaner (~> 1.6, >= 1.6.0)
   rake (~> 10.0)
   rspec (~> 3.0)
   scruber-mongo!
data/README.md
CHANGED
@@ -1,38 +1,50 @@
-# Scruber
+# Scruber-mongo

-
-
-TODO: Delete this and the text above, and describe your gem
+This gem provides Mongo support for Scruber

 ## Installation

-Add this line to your application's Gemfile:
+1. Add this line to your application's Gemfile:

 ```ruby
 gem 'scruber-mongo'
 ```

-And then execute:
+2. And then execute:

     $ bundle

-
+3. Install gem
+
+    $ scruber generate mongo:install

-
+This gem provides Queue driver, Output driver and FetcherAgent driver for mongo.

-##
+## Sample scraper

-
+```ruby
+Scruber.run do
+  get "http://example.abc/product"
+
+  parse :html do |page, doc|
+    id = mongo_out_product title: doc.at('title').text

-
+    get_reviews URI.join(page.url, doc.at('a.review_link').attr('href')).to_s, product_id: id
+  end

-
+  parse_reviews :html do |page,doc|
+    product = mongo_find_product page.options[:product_id]

-
+    product[:reviews] = doc.search('.review').map{|r| {author: r.at('.author').text, text: r.at('.text').text } }
+
+    mongo_out_product product
+  end
+end
+```

 ## Contributing

-Bug reports and pull requests are welcome on GitHub at https://github.com/
+Bug reports and pull requests are welcome on GitHub at https://github.com/scruber/scruber-mongo.

 ## License

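A note on the sample above: `mongo_out_product` and `mongo_find_product` are dynamically generated variants of the core `mongo_out`/`mongo_find` methods documented in `mongo_output.rb` below; the suffix in the method name selects the target collection. A minimal sketch of the mapping (the `:shop` scraper name and an empty `collections_prefix` are assumptions for illustration):

```ruby
# Assumes a scraper named :shop and the default (empty) collections_prefix.
Scruber.run :shop do
  get 'http://example.com/product'

  parse :html do |page, doc|
    mongo_out title: doc.at('title').text          # -> "shop_records" collection
    mongo_out_product title: doc.at('title').text  # -> "shop_product" collection
  end
end
```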
data/lib/scruber/core/extensions/mongo_output.rb
CHANGED
@@ -1,17 +1,67 @@
 module Scruber
   module Core
     module Extensions
+
+      #
+      # Extension for writing results to mongo collections.
+      # It registers methods for writing documents:
+      #   mongo_out({..})          # writing document to {prefix}_{scraper_name}_records
+      #   mongo_out_product({..})  # writing document to {prefix}_{scraper_name}_product
+      # Searching methods:
+      #   mongo_find({..})         # searching document in {prefix}_{scraper_name}_records
+      #   mongo_find_product({..}) # searching document in {prefix}_{scraper_name}_product
+      # Accessing to mongo collection:
+      #   mongo_collection({..})          # Direct access to {prefix}_{scraper_name}_records
+      #   mongo_product_collection({..})  # Direct access to {prefix}_{scraper_name}_product
+      #
+      # @example Writing products data and companies
+      #   Scruber.run :simple do
+      #     get_product 'http://example.com/product'
+      #     get_company 'http://example.com/product'
+      #
+      #     parse_product :html do |page,doc|
+      #       id = mongo_out_product {title: doc.at('h1').text, price: doc.at('.price').text }
+      #       record = mongo_find_product id
+      #       record[:description] = doc.at('.desc').text
+      #       mongo_out_product record
+      #       log "Count: #{mongo_product_collection.count}"
+      #     end
+      #
+      #     parse_company :html do |page,doc|
+      #       mongo_out_company {name: doc.at('h1').text, phone: doc.at('.phone').text }
+      #     end
+      #   end
+      #
+      # @author Ivan Goncharov
+      #
       class MongoOutput < Base
         module CoreMethods

+          #
+          # Mongo out default method. By default it uses suffix *_records*
+          #
+          # @param fields [Hash] Fields to output
+          # @param options [Hash] Output options, see https://docs.mongodb.com/manual/reference/method/db.collection.findOneAndUpdate/
+          #
+          # @return [Object] id of writed record
           def mongo_out(fields, options={})
             Scruber::Core::Extensions::MongoOutput.mongo_out self.scraper_name, Scruber::Core::Extensions::MongoOutput.default_suffix_name, fields, options
           end

+          #
+          # Find mongo document by id
+          #
+          # @param id [Object] id of document
+          #
+          # @return [Hash] mongo document
           def mongo_find(id)
             Scruber::Core::Extensions::MongoOutput.mongo_find self.scraper_name, Scruber::Core::Extensions::MongoOutput.default_suffix_name, id
           end

+          #
+          # Direct access to mongo collection
+          #
+          # @return [Mongo::Collection] Mongo collection instance
           def mongo_collection
             Scruber::Core::Extensions::MongoOutput.mongo_collection self.scraper_name, Scruber::Core::Extensions::MongoOutput.default_suffix_name
           end
@@ -51,25 +101,47 @@ module Scruber
         end

         class << self
+          # Default mongo collection suffix name
           attr_writer :default_suffix_name

+          #
+          # Default mongo collection suffix name
+          #
+          # @return [String] Default mongo collection suffix name
           def default_suffix_name
             @default_suffix_name ||= 'records'
           end

+          #
+          # Writing results to mongo collection
+          #
+          # @param scraper_name [String] name of scraper to build collection name
+          # @param suffix [String] suffix to build collection name
+          # @param fields [Hash] Document to output
+          # @param options [Hash] Options for updating record (when *_id* not set), see https://docs.mongodb.com/manual/reference/method/db.collection.findOneAndUpdate/
+          #
+          # @return [type] [description]
           def mongo_out(scraper_name, suffix, fields, options={})
             fields = fields.with_indifferent_access
             if fields[:_id].blank?
-              Scruber::Mongo.client[out_collection_name(scraper_name, suffix)].insert_one(fields)
+              Scruber::Mongo.client[out_collection_name(scraper_name, suffix)].insert_one(fields).inserted_id
             else
               Scruber::Mongo.client[out_collection_name(scraper_name, suffix)].find_one_and_update(
                 {"_id" => fields[:_id] },
                 {'$set' => fields },
-                {return_document: :
-              )
+                {return_document: :after, upsert: true}.merge(options)
+              )[:_id]
             end
           end

+          #
+          # Searching document in mongo
+          #
+          # @param scraper_name [String] name of scraper to build collection name
+          # @param suffix [String] suffix to build collection name
+          # @param id [Object] id of document
+          #
+          # @return [Hash] document
           def mongo_find(scraper_name, suffix, id)
             if id.is_a?(Hash)
               Scruber::Mongo.client[out_collection_name(scraper_name, suffix)].find(id)
@@ -78,10 +150,24 @@ module Scruber
             end
           end

+          #
+          # Access to mongo collection
+          #
+          # @param scraper_name [String] name of scraper to build collection name
+          # @param suffix [String] suffix to build collection name
+          #
+          # @return [Mongo::Collection] instance of Mongo::Collection
           def mongo_collection(scraper_name, suffix)
             Scruber::Mongo.client[out_collection_name(scraper_name, suffix)]
           end

+          #
+          # Collection name builder
+          #
+          # @param scraper_name [String] name of scraper to build collection name
+          # @param suffix [String] suffix to build collection name
+          #
+          # @return [String] name of collection for given scraper_name and suffix
           def out_collection_name(scraper_name, suffix)
             [Scruber::Mongo.configuration.options['collections_prefix'], scraper_name, suffix].select(&:present?).map(&:to_s).join('_')
           end
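The substantive change here is the return value: `mongo_out` now returns the document id from both branches (`insert_one(...).inserted_id` for new documents, `find_one_and_update(...)[:_id]` for upserts), which is what lets the README sample chain `id = mongo_out_product ...` into a later `mongo_find_product id`. A sketch of that round trip (scraper name and fields are illustrative):

```ruby
Scruber.run :shop do
  get 'http://example.com/product'

  parse :html do |page, doc|
    # 0.1.1: mongo_out returns the _id for inserts as well as updates
    id = mongo_out(title: doc.at('title').text)

    record = mongo_find(id)             # read the document back by its id
    record[:checked_at] = Time.now.to_i
    mongo_out(record)                   # _id present -> upsert via $set, same id returned
  end
end
```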
data/lib/scruber/queue_adapters/mongo.rb
CHANGED
@@ -1,56 +1,114 @@
 module Scruber
   module QueueAdapters
     class Mongo < AbstractAdapter
-      attr_reader :error_pages

       class Page < Scruber::QueueAdapters::AbstractAdapter::Page
         def id
-          @options[:_id] || @
+          @options[:_id] || @id
         end

-
+        #
+        # Saving page to queue
+        # @param options [Hash] saving options
+        # @param save_options={} [type] [description]
+        #
+        # @return [type] [description]
+        def save(options={}, save_options={})
           if id.blank?
             @queue.collection.insert_one(attrs)
           else
-
-
-
-
-
+            if options[:new]
+              @queue.collection.find_one_and_update(
+                {"_id" => self.id },
+                {'$setOnInsert' => attrs },
+                {return_document: :after, upsert: true, projection: {_id: 1}}.merge(options)
+              )
+            else
+              @queue.collection.find_one_and_update(
+                {"_id" => self.id },
+                {'$set' => attrs },
+                {return_document: :after, upsert: true, projection: {_id: 1}}.merge(options)
+              )
+            end
           end
         end

+        #
+        # Mark page as processed by parser and save it
+        #
+        # @return [void]
+        def processed!
+          # Monkey patch for processing error pages.
+          if @fetched_at == 0
+            @fetched_at = -1
+          end
+          super
+        end
+
+        #
+        # Generating hash with mongo doc attributes
+        #
+        # @return [Hash] hash with page attributes
         def attrs
           @options.with_indifferent_access.except('id', '_id').merge(id.present? ? {_id: id} : {}).merge (instance_variables.select{|ivar| !(ivar.to_s =~ /\@_/) }-[:@options, :@queue]).inject({}){|acc,ivar| acc[ivar[1..-1]] = instance_variable_get(ivar);acc }.with_indifferent_access
         end

+        #
+        # Delete record from Mongo collection
+        #
+        # @return [void]
         def delete
           @queue.collection.find({"_id" => self.id }).delete_one if self.id.present?
         end
       end

-      #
-      #
-      #
-
-
+      #
+      # Add page to queue
+      # @param url [String] URL of page
+      # @param options [Hash] Other options, see {Scruber::QueueAdapters::AbstractAdapter::Page}
+      #
+      # @return [void]
+      def add(url_or_page, options={})
         if url_or_page.is_a?(Page)
           url_or_page.queue = self
-          url_or_page.save(options)
+          url_or_page.save({new: true}.merge(options))
         else
-          Page.new(self, url_or_page
+          Page.new(self, options.merge(url: url_or_page)).save({new: true})
         end
       end
-      alias_method :
+      alias_method :push, :add

-
+      #
+      # Size of queue
+      #
+      # @return [Integer] count of pages in queue
+      def size
         collection.count
       end

+      #
+      # Count of downloaded pages
+      # Using to show downloading progress.
+      #
+      # @return [Integer] count of downloaded pages
+      def downloaded_count
+        collection.find({fetched_at: {"$gt" => 0}}).count
+      end
+
+      #
+      # Search page by id
+      # @param id [Object] id of page
+      #
+      # @return [Page] page object
       def find(id)
         build_pages collection.find({_id: id}).first
       end

+      #
+      # Fetch downloaded and not processed pages for feching
+      # @param count=nil [Integer] count of pages to fetch
+      #
+      # @return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] page of count = nil, or array of pages of count > 0
       def fetch_downloaded(count=nil)
         if count.nil?
           build_pages collection.find({fetched_at: {"$gt" => 0}, processed_at: 0}).first
@@ -59,34 +117,81 @@ module Scruber
         end
       end

+      #
+      # Fetch pending page for fetching
+      # @param count=nil [Integer] count of pages to fetch
+      #
+      # @return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] page of count = nil, or array of pages of count > 0
       def fetch_pending(count=nil)
         if count.nil?
-          build_pages collection.find({fetched_at: 0, retry_at: {"$lte" => Time.now.to_i}}).first
+          build_pages collection.find({fetched_at: 0, retry_count: {"$lt" => ::Scruber.configuration.fetcher_options[:max_retry_times]}, retry_at: {"$lte" => Time.now.to_i}}).first
         else
-          build_pages collection.find({fetched_at: 0, retry_at: {"$lte" => Time.now.to_i}}).limit(count).to_a
+          build_pages collection.find({fetched_at: 0, retry_count: {"$lt" => ::Scruber.configuration.fetcher_options[:max_retry_times]}, retry_at: {"$lte" => Time.now.to_i}}).limit(count).to_a
         end
       end

+      #
+      # Fetch error page
+      # @param count=nil [Integer] count of pages to fetch
+      #
+      # @return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] page of count = nil, or array of pages of count > 0
+      def fetch_error(count=nil)
+        if count.nil?
+          build_pages collection.find({fetched_at: 0, retry_count: {"$gte" => ::Scruber.configuration.fetcher_options[:max_retry_times]}}).first
+        else
+          build_pages collection.find({fetched_at: 0, retry_count: {"$gte" => ::Scruber.configuration.fetcher_options[:max_retry_times]}}).limit(count).to_a
+        end
+      end
+
+      #
+      # Used by Core. It checks for pages that are
+      # not downloaded or not parsed yet.
+      #
+      # @return [Boolean] true if queue still has work for scraper
       def has_work?
         fetch_pending.present? || fetch_downloaded.present?
       end

+      #
+      # Accessing to mongo collection instance
+      #
+      # @return [Mongo::Collection] Mongo collection instance
       def collection
         Scruber::Mongo.client[pages_collection_name]
       end

+      #
+      # Check if queue was initialized.
+      # Using for `seed` method. If queue was initialized,
+      # then no need to run seed block.
+      #
+      # @return [Boolean] true if queue already was initialized
+      def initialized?
+        Scruber::Mongo.client[pages_collection_name].find.first.present?
+      end
+
       private

+      #
+      # Wrapping mongo objects into queue Page objects
+      #
+      # @param pages [Hash|Array<Hash>] Mongo document or array of mongo documents
+      #
+      # @return [type] [description]
       def build_pages(pages)
         if pages.nil?
           nil
         elsif pages.is_a?(Array)
-          pages.map{|p| Page.new(self, p['url']
+          pages.map{|p| Page.new(self, p.with_indifferent_access.merge(url: p['url']) )}
         else
-          Page.new(self, pages['url']
+          Page.new(self, pages.with_indifferent_access.merge(url: pages['url']) )
         end
       end

+      #
+      # Generating mongo pages collection name
+      #
+      # @return [String] name of pages collection
       def pages_collection_name
         @_pages_collection_name ||= [Scruber::Mongo.configuration.options['collections_prefix'], @options[:scraper_name], 'pages'].select(&:present?).map(&:to_s).join('_')
       end
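Taken together, the queue changes split the old pending pool in two: `fetch_pending` now skips pages whose `retry_count` has reached `fetcher_options[:max_retry_times]`, and the new `fetch_error` returns exactly those pages. A rough usage sketch; the adapter constructor arguments are inferred from `@options[:scraper_name]` in this diff, not from separate documentation:

```ruby
queue = Scruber::QueueAdapters::Mongo.new(scraper_name: :shop)

queue.add 'http://example.com/product'  # upserts with $setOnInsert, so re-adding won't clobber state
queue.size                              # total pages in the collection
queue.downloaded_count                  # pages with fetched_at > 0, used for progress display

pending = queue.fetch_pending(10)       # pages still under the retry limit
failed  = queue.fetch_error(10)         # pages that exhausted max_retry_times
```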
data/scruber-mongo.gemspec
CHANGED
@@ -30,11 +30,11 @@ Gem::Specification.new do |spec|
   spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]

-  spec.add_dependency "scruber", "~> 0.1.
+  spec.add_dependency "scruber", "~> 0.1.6"
   spec.add_dependency "mongo", "~> 2.4"
   spec.add_development_dependency "bundler", "~> 1.16"
   spec.add_development_dependency "rake", "~> 10.0"
   spec.add_development_dependency "rspec", "~> 3.0"
-  spec.add_development_dependency "database_cleaner",
+  spec.add_development_dependency "database_cleaner", '~> 1.6', '>= 1.6.0'
   spec.add_development_dependency "webmock", "3.0.1"
 end
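The loosened `database_cleaner` constraint is the common pessimistic-plus-floor pattern: `'~> 1.6', '>= 1.6.0'` admits any 1.x release from 1.6.0 up, where the old `~> 1.6.0` stopped below 1.7. A quick check with stock RubyGems (not part of this package):

```ruby
require 'rubygems'

req = Gem::Requirement.new('~> 1.6', '>= 1.6.0')
req.satisfied_by?(Gem::Version.new('1.6.2'))  # => true
req.satisfied_by?(Gem::Version.new('1.9.0'))  # => true  ("~> 1.6.0" would have rejected this)
req.satisfied_by?(Gem::Version.new('2.0.0'))  # => false ("~> 1.6" caps below 2.0)
```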
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scruber-mongo
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Ivan Goncharov
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-
+date: 2018-04-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: scruber
@@ -16,14 +16,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.1.
+        version: 0.1.6
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.1.
+        version: 0.1.6
 - !ruby/object:Gem::Dependency
   name: mongo
   requirement: !ruby/object:Gem::Requirement
@@ -85,6 +85,9 @@ dependencies:
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.6'
+    - - ">="
       - !ruby/object:Gem::Version
         version: 1.6.0
   type: :development
@@ -92,6 +95,9 @@ dependencies:
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.6'
+    - - ">="
       - !ruby/object:Gem::Version
         version: 1.6.0
 - !ruby/object:Gem::Dependency
@@ -130,6 +136,7 @@ files:
 - lib/scruber/mongo.rb
 - lib/scruber/mongo/cli/generators.rb
 - lib/scruber/mongo/cli/templates/mongo.tt
+- lib/scruber/mongo/cli/templates/mongo_initializer.tt
 - lib/scruber/mongo/configuration.rb
 - lib/scruber/mongo/factory.rb
 - lib/scruber/mongo/version.rb