gcrawler 0.1.0 → 0.1.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 3c1b8fc3cdd83389fd819bb922e5a40c5b6d0e31efad64d44643e686dacb6903
-  data.tar.gz: a333883b2912929e8b19d46f06e1ad3ee2cf7d455b9925cc8cce22ce6d7a30dd
+  metadata.gz: b6e556eb21077d9c958169dcc23e580d82fb0d350bd6262fb3e0b2765d83f31c
+  data.tar.gz: 17ebf8852c1a5543a85ee29a45db2bc4eb87c1ea8ed12f3d56c190ea10107703
 SHA512:
-  metadata.gz: 45a235679f7d963eee03ddb87d8cb76bc8fd81218016443c57b95b2aae737678095eec3d6b1a1c48b281f2b29339a4048dba3c8cc702b13afeb9bb6a750befb4
-  data.tar.gz: 3fee1f2b977bc448bee35d96736017b171f7ad4ffd4595e2e1fa488e33d914bf50341206c885363690e5be3a454f5c56207298877faac89436e9e0f0a5cfc5d7
+  metadata.gz: 63083b51697eb73760ddb29a97ce7770f5404263751f6213471955e571c1e2a800fe92d2d6bc79c1d857c085eae1eecfe8cd82baa6864b3526e5b3581d9009ef
+  data.tar.gz: 0b0a35efabdf3f69a8f6f288586561b7124d2b2440b405840aff0a57ac78c833c9ca37129443eb7a92b37ab43d9e2508105b2522a93872f6c6c28884d3ea4d93
data/.github/workflows/ruby.yml ADDED
@@ -0,0 +1,38 @@
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+# This workflow will download a prebuilt Ruby version, install dependencies and run tests with Rake
+# For more information see: https://github.com/marketplace/actions/setup-ruby-jruby-and-truffleruby
+
+name: Ruby
+
+on:
+  push:
+    branches: [ "master" ]
+  pull_request:
+    branches: [ "master" ]
+
+permissions:
+  contents: read
+
+jobs:
+  test:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        ruby-version: ['2.7', '3.0']
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Ruby
+      # To automatically get bug fixes and new Ruby versions for ruby/setup-ruby,
+      # change this to (see https://github.com/ruby/setup-ruby#versioning):
+      # uses: ruby/setup-ruby@v1
+      uses: ruby/setup-ruby@0a29871fe2b0200a17a4497bae54fe5df0d973aa # v1.115.3
+      with:
+        ruby-version: ${{ matrix.ruby-version }}
+        bundler-cache: true # runs 'bundle install' and caches installed gems automatically
+    - name: Run tests
+      run: COVERALLS_REPO_TOKEN=${{ secrets.REPO_TOKEN }} bundle exec rake
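The workflow above only exports `COVERALLS_REPO_TOKEN` to the `bundle exec rake` run; coverage reporting itself has to be started from the gem's test setup, which is not part of this diff. A minimal sketch of how coveralls_reborn is typically enabled in a hypothetical `spec/spec_helper.rb` (assumed, not taken from this package):

```ruby
# spec/spec_helper.rb (hypothetical) -- start the Coveralls reporter before
# loading the code under test so all required files are tracked.
require 'coveralls'
Coveralls.wear!

require 'gcrawler'
```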
data/.gitignore CHANGED
@@ -9,3 +9,4 @@
 
 # rspec failure tracking
 .rspec_status
+.coveralls.yml
data/CHANGELOG.md ADDED
@@ -0,0 +1,9 @@
+### version 0.1.2 (2022-09-27)
+
+* Add pause argument for search functions
+
+### version 0.1.1 (2022-09-24)
+
+* Add logger class for stdout
+* Add test specs
+
data/Gemfile CHANGED
@@ -1,7 +1,10 @@
-source "https://rubygems.org"
+source 'https://rubygems.org'
 
 # Specify your gem's dependencies in gcrawler.gemspec
 gemspec
 
-gem "rake", "~> 12.0"
-gem "rspec", "~> 3.0"
+gem 'coveralls_reborn', require: false
+gem 'rake', '~> 12.0'
+gem 'rspec', '~> 3.0'
+
+gem 'wombat'
data/Gemfile.lock CHANGED
@@ -1,13 +1,65 @@
 PATH
   remote: .
   specs:
-    gcrawler (0.1.0)
+    gcrawler (0.1.2)
 
 GEM
   remote: https://rubygems.org/
   specs:
+    activesupport (7.0.4)
+      concurrent-ruby (~> 1.0, >= 1.0.2)
+      i18n (>= 1.6, < 2)
+      minitest (>= 5.1)
+      tzinfo (~> 2.0)
+    addressable (2.8.1)
+      public_suffix (>= 2.0.2, < 6.0)
+    concurrent-ruby (1.1.10)
+    connection_pool (2.3.0)
+    coveralls_reborn (0.25.0)
+      simplecov (>= 0.18.1, < 0.22.0)
+      term-ansicolor (~> 1.6)
+      thor (>= 0.20.3, < 2.0)
+      tins (~> 1.16)
     diff-lcs (1.5.0)
+    docile (1.4.0)
+    domain_name (0.5.20190701)
+      unf (>= 0.0.5, < 1.0.0)
+    http-accept (1.7.0)
+    http-cookie (1.0.5)
+      domain_name (~> 0.5)
+    i18n (1.12.0)
+      concurrent-ruby (~> 1.0)
+    mechanize (2.8.5)
+      addressable (~> 2.8)
+      domain_name (~> 0.5, >= 0.5.20190701)
+      http-cookie (~> 1.0, >= 1.0.3)
+      mime-types (~> 3.0)
+      net-http-digest_auth (~> 1.4, >= 1.4.1)
+      net-http-persistent (>= 2.5.2, < 5.0.dev)
+      nokogiri (~> 1.11, >= 1.11.2)
+      rubyntlm (~> 0.6, >= 0.6.3)
+      webrick (~> 1.7)
+      webrobots (~> 0.1.2)
+    mime-types (3.4.1)
+      mime-types-data (~> 3.2015)
+    mime-types-data (3.2022.0105)
+    mini_portile2 (2.8.0)
+    minitest (5.16.3)
+    net-http-digest_auth (1.4.1)
+    net-http-persistent (4.0.1)
+      connection_pool (~> 2.2)
+    netrc (0.11.0)
+    nokogiri (1.13.8)
+      mini_portile2 (~> 2.8.0)
+      racc (~> 1.4)
+    public_suffix (5.0.0)
+    racc (1.6.0)
     rake (12.3.3)
+    rest-client (2.1.0)
+      http-accept (>= 1.7.0, < 2.0)
+      http-cookie (>= 1.0.2, < 2.0)
+      mime-types (>= 1.16, < 4.0)
+      netrc (~> 0.8)
     rspec (3.11.0)
       rspec-core (~> 3.11.0)
       rspec-expectations (~> 3.11.0)
@@ -21,14 +73,40 @@ GEM
       diff-lcs (>= 1.2.0, < 2.0)
       rspec-support (~> 3.11.0)
     rspec-support (3.11.1)
+    rubyntlm (0.6.3)
+    simplecov (0.21.2)
+      docile (~> 1.1)
+      simplecov-html (~> 0.11)
+      simplecov_json_formatter (~> 0.1)
+    simplecov-html (0.12.3)
+    simplecov_json_formatter (0.1.4)
+    sync (0.5.0)
+    term-ansicolor (1.7.1)
+      tins (~> 1.0)
+    thor (1.2.1)
+    tins (1.31.1)
+      sync
+    tzinfo (2.0.5)
+      concurrent-ruby (~> 1.0)
+    unf (0.1.4)
+      unf_ext
+    unf_ext (0.0.8.2)
+    webrick (1.7.0)
+    webrobots (0.1.2)
+    wombat (3.0.0)
+      activesupport
+      mechanize (~> 2.8.5)
+      rest-client
 
 PLATFORMS
   ruby
 
 DEPENDENCIES
+  coveralls_reborn
   gcrawler!
   rake (~> 12.0)
   rspec (~> 3.0)
+  wombat
 
 BUNDLED WITH
    2.1.4
data/README.md CHANGED
@@ -1,6 +1,9 @@
 # Gcrawler
 
-Google search crawler for Ruby version.
+[![Gem Version](https://badge.fury.io/rb/gcrawler.svg)](https://badge.fury.io/rb/gcrawler)
+[![Coverage Status](https://coveralls.io/repos/github/rogerluo410/gcrawler/badge.svg?branch=master)](https://coveralls.io/github/rogerluo410/gcrawler?branch=master)
+
+Google search crawler for Ruby version. Crawls each link's text and URL by keywords on Google.com.
 
 ## Installation
 
@@ -23,16 +26,29 @@ Or install it yourself as:
 ```ruby
 require 'gcrawler'
 
+# Set proxy servers; multiple IPs are much safer than a single IP.
 proxies = [
-  { ip: '127.0.0.1', port: '7890' }
+  { ip: '127.0.0.1', port: '7890' },
+  ...
 ]
 
+# Exclude these hosts from the result links.
 exclude_hosts = [
-  'accounts.google.com',
-  'support.google.com'
+  'accounts.google.com',
+  'support.google.com'
+]
+
+# Don't search on these blacklisted Google domains.
+black_domains = [
+  'www.google.at',
+  'www.google.bf'
 ]
 
-google_crawler = GoogleCrawler.new(proxies: proxies, exclude_hosts: exclude_hosts)
+google_crawler = GoogleCrawler.new(
+  proxies: proxies,
+  black_domains: black_domains,
+  exclude_hosts: exclude_hosts
+)
 
 # Output: Mechanize::Page, see https://github.com/sparklemotion/mechanize
 pp google_crawler.search_as_page('お肉とチーズの専門店', 'ミートダルマ札幌店')
@@ -43,9 +59,12 @@
 # Output: ['url1', 'url2', ...]
 pp google_crawler.search_as_url('お肉とチーズの専門店', 'ミートダルマ札幌店', country: 'ja')
 
+# Get the second page:
+pp google_crawler.search_as_url('お肉とチーズの専門店', 'ミートダルマ札幌店', country: 'ja', start: 10)
+
 ```
 
-Function args definition:
+Function input and output definitions:
 
 search_as_page:
   Args:
@@ -53,11 +72,44 @@ Function args definition:
     language (str, optional): Query language. Defaults to nil.
     num (uint, optional): Number of results per page(default is 10 per page). Defaults to nil.
     start (int, optional): Offset. Defaults to 0.
-    country (str, optional): Query country, Defaults to None, example: countryCN or cn or CN
+    country (str, optional): Query country. Defaults to nil, example: countryCN or cn or CN.
+    pause (uint, optional): Set crawling delay seconds between two requests.
+                            Too short a delay may be blocked by Google's crawling monitor.
+                            Defaults to 0.
 
   Return:
     Mechanize::Page, see https://github.com/sparklemotion/mechanize
 
+
+search_as_url:
+  Args:
+    keywords (varargs): kw1, kw2, kw3, ...
+    language (str, optional): Query language. Defaults to nil.
+    num (uint, optional): Number of results per page(default is 10 per page). Defaults to nil.
+    start (int, optional): Offset. Defaults to 0.
+    country (str, optional): Query country. Defaults to nil, example: countryCN or cn or CN.
+    pause (uint, optional): Set crawling delay seconds between two requests.
+                            Too short a delay may be blocked by Google's crawling monitor.
+                            Defaults to 0.
+
+  Return:
+    ['url1', 'url2', ...]
+
+
+search_as_object:
+  Args:
+    keywords (varargs): kw1, kw2, kw3, ...
+    language (str, optional): Query language. Defaults to nil.
+    num (uint, optional): Number of results per page(default is 10 per page). Defaults to nil.
+    start (int, optional): Offset. Defaults to 0.
+    country (str, optional): Query country. Defaults to nil, example: countryCN or cn or CN.
+    pause (uint, optional): Set crawling delay seconds between two requests.
+                            Too short a delay may be blocked by Google's crawling monitor.
+                            Defaults to 0.
+
+  Return:
+    [{text: xxx, url: xxx}, ...]
+
 
 ## Development
 
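The `pause` argument documented above replaces the randomized inter-request delay with a fixed one when it is non-zero. A short usage sketch (the keyword and the 5-second value are illustrative, reusing the `google_crawler` instance from the README example):

```ruby
# Wait a fixed 5 seconds between requests instead of the default random interval.
pp google_crawler.search_as_url('お肉とチーズの専門店', country: 'ja', pause: 5)

# pause: 0 (the default) keeps the crawler's randomized delay.
pp google_crawler.search_as_object('お肉とチーズの専門店')
```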
@@ -6,8 +6,11 @@
 
 require 'wombat'
 require 'uri'
+require 'logger'
 require_relative './utils'
 
+LOGGER = Logger.new(STDOUT)
+
 # Crawl action
 class Crawler
   include Wombat::Crawler
@@ -27,7 +30,7 @@ class Crawler
     mechanize.set_proxy(*proxy) if proxy.length == 2
     mechanize.user_agent = user_agent
 
-    pp "proxy: #{proxy}, user_agent: #{user_agent}"
+    LOGGER.info "proxy: #{proxy}, user_agent: #{user_agent}"
   end
 end
 
@@ -44,15 +47,15 @@ class GoogleCrawler
   end
 
   # search as url
-  def search_as_url(*keywords, language: nil, num: nil, country: nil, start: 0)
-    search_as_page(*keywords, language: language, num: num, country: country, start: start)
+  def search_as_url(*keywords, language: nil, num: nil, country: nil, start: 0, pause: 0)
+    search_as_page(*keywords, language: language, num: num, country: country, start: start, pause: pause)
 
     filter_urls
   end
 
   # search as object with keys {'text', 'url'}
-  def search_as_object(*keywords, language: nil, num: nil, country: nil, start: 0)
-    search_as_page(*keywords, language: language, num: num, country: country, start: start)
+  def search_as_object(*keywords, language: nil, num: nil, country: nil, start: 0, pause: 0)
+    search_as_page(*keywords, language: language, num: num, country: country, start: start, pause: pause)
 
     generate_objects
   end
@@ -63,12 +66,14 @@ class GoogleCrawler
   # language (str, optional): Query language. Defaults to nil.
   # num (uint, optional): Number of results per page(default is 10 per page). Defaults to nil.
   # start (int, optional): Offset. Defaults to 0.
-  # country (str, optional): Query country, Defaults to None, example: countryCN or cn or CN
+  # country (str, optional): Query country. Defaults to nil, example: countryCN or cn or CN.
+  # pause (uint, optional): Set crawling delay seconds between two requests.
+  #                         Too short a delay may be blocked by Google's crawling monitor. Defaults to 0.
   #
   # Return:
   #   Mechanize::Page, see https://github.com/sparklemotion/mechanize
   #
-  def search_as_page(*keywords, language: nil, num: nil, country: nil, start: 0)
+  def search_as_page(*keywords, language: nil, num: nil, country: nil, start: 0, pause: 0)
     return if keywords.empty?
 
     query_str = "q=#{keywords.join('+')}&btnG=Search&gbv=1&safe=active&start=0"
@@ -79,15 +84,15 @@ class GoogleCrawler
 
     @crawler.query_str(query_str)
 
-    seconds = Utils.random_interval_time
-    pp "Crawling query string is #{query_str}, will be crawling after #{seconds} seconds..."
+    seconds = pause.zero? ? Utils.random_interval_time : pause
+    LOGGER.info "Crawling query string is #{query_str}, will be crawling after #{seconds} seconds..."
     sleep(seconds)
 
     @crawler.crawl
 
    raise "Fetch on Google failed with code #{@crawler.response_code}" unless @crawler.response_code == 200
 
-    pp 'Crawl on Google successfully...'
+    LOGGER.info 'Crawl on Google successfully...'
   end
 
   private
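The behavioural core of the 0.1.2 change is the delay selection in `search_as_page`: a zero `pause` falls back to `Utils.random_interval_time`, anything positive is used as a fixed delay. A self-contained illustration of that ternary (`effective_delay` is a hypothetical helper written only for this sketch, with the random interval passed in as a plain value):

```ruby
# Mirrors `seconds = pause.zero? ? Utils.random_interval_time : pause` from the
# diff above, kept standalone so it can be run without the gem.
def effective_delay(pause, random_interval)
  pause.zero? ? random_interval : pause
end

effective_delay(0, 7) # => 7 (default: randomized interval between requests)
effective_delay(3, 7) # => 3 (explicit pause overrides the random delay)
```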
@@ -1,3 +1,3 @@
 module Gcrawler
-  VERSION = "0.1.0"
+  VERSION = '0.1.2'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: gcrawler
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.2
 platform: ruby
 authors:
 - rogerluo410
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2022-09-23 00:00:00.000000000 Z
+date: 2022-09-27 00:00:00.000000000 Z
 dependencies: []
 description: Crawling link text and link url by keywords on Google.com.
 email:
@@ -17,9 +17,11 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- ".github/workflows/ruby.yml"
 - ".gitignore"
 - ".rspec"
 - ".travis.yml"
+- CHANGELOG.md
 - CODE_OF_CONDUCT.md
 - Gemfile
 - Gemfile.lock