gcrawler 0.1.0 → 0.1.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 3c1b8fc3cdd83389fd819bb922e5a40c5b6d0e31efad64d44643e686dacb6903
-  data.tar.gz: a333883b2912929e8b19d46f06e1ad3ee2cf7d455b9925cc8cce22ce6d7a30dd
+  metadata.gz: b6e556eb21077d9c958169dcc23e580d82fb0d350bd6262fb3e0b2765d83f31c
+  data.tar.gz: 17ebf8852c1a5543a85ee29a45db2bc4eb87c1ea8ed12f3d56c190ea10107703
 SHA512:
-  metadata.gz: 45a235679f7d963eee03ddb87d8cb76bc8fd81218016443c57b95b2aae737678095eec3d6b1a1c48b281f2b29339a4048dba3c8cc702b13afeb9bb6a750befb4
-  data.tar.gz: 3fee1f2b977bc448bee35d96736017b171f7ad4ffd4595e2e1fa488e33d914bf50341206c885363690e5be3a454f5c56207298877faac89436e9e0f0a5cfc5d7
+  metadata.gz: 63083b51697eb73760ddb29a97ce7770f5404263751f6213471955e571c1e2a800fe92d2d6bc79c1d857c085eae1eecfe8cd82baa6864b3526e5b3581d9009ef
+  data.tar.gz: 0b0a35efabdf3f69a8f6f288586561b7124d2b2440b405840aff0a57ac78c833c9ca37129443eb7a92b37ab43d9e2508105b2522a93872f6c6c28884d3ea4d93
data/.github/workflows/ruby.yml ADDED
@@ -0,0 +1,38 @@
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+# This workflow will download a prebuilt Ruby version, install dependencies and run tests with Rake
+# For more information see: https://github.com/marketplace/actions/setup-ruby-jruby-and-truffleruby
+
+name: Ruby
+
+on:
+  push:
+    branches: [ "master" ]
+  pull_request:
+    branches: [ "master" ]
+
+permissions:
+  contents: read
+
+jobs:
+  test:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        ruby-version: ['2.7', '3.0']
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Ruby
+      # To automatically get bug fixes and new Ruby versions for ruby/setup-ruby,
+      # change this to (see https://github.com/ruby/setup-ruby#versioning):
+      # uses: ruby/setup-ruby@v1
+      uses: ruby/setup-ruby@0a29871fe2b0200a17a4497bae54fe5df0d973aa # v1.115.3
+      with:
+        ruby-version: ${{ matrix.ruby-version }}
+        bundler-cache: true # runs 'bundle install' and caches installed gems automatically
+    - name: Run tests
+      run: COVERALLS_REPO_TOKEN=${{ secrets.REPO_TOKEN }} bundle exec rake
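The workflow above only exports `COVERALLS_REPO_TOKEN` to the `bundle exec rake` run; coverage reporting itself has to be started from the gem's test setup, which is not part of this diff. A minimal sketch of how coveralls_reborn is typically enabled in a hypothetical `spec/spec_helper.rb` (assumed, not taken from this package):

```ruby
# spec/spec_helper.rb (hypothetical) -- start the Coveralls reporter before
# loading the code under test so all required files are tracked.
require 'coveralls'
Coveralls.wear!

require 'gcrawler'
```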
data/.gitignore CHANGED
@@ -9,3 +9,4 @@
 
 # rspec failure tracking
 .rspec_status
+.coveralls.yml
data/CHANGELOG.md ADDED
@@ -0,0 +1,9 @@
+### version 0.1.2 (2022-09-27)
+
+* Add pause argument for search functions
+
+### version 0.1.1 (2022-09-24)
+
+* Add logger class for stdout
+* Add test specs
+
data/Gemfile CHANGED
@@ -1,7 +1,10 @@
-source "https://rubygems.org"
+source 'https://rubygems.org'
 
 # Specify your gem's dependencies in gcrawler.gemspec
 gemspec
 
-gem "rake", "~> 12.0"
-gem "rspec", "~> 3.0"
+gem 'coveralls_reborn', require: false
+gem 'rake', '~> 12.0'
+gem 'rspec', '~> 3.0'
+
+gem 'wombat'
data/Gemfile.lock CHANGED
@@ -1,13 +1,65 @@
 PATH
   remote: .
   specs:
-    gcrawler (0.1.0)
+    gcrawler (0.1.2)
 
 GEM
   remote: https://rubygems.org/
   specs:
+    activesupport (7.0.4)
+      concurrent-ruby (~> 1.0, >= 1.0.2)
+      i18n (>= 1.6, < 2)
+      minitest (>= 5.1)
+      tzinfo (~> 2.0)
+    addressable (2.8.1)
+      public_suffix (>= 2.0.2, < 6.0)
+    concurrent-ruby (1.1.10)
+    connection_pool (2.3.0)
+    coveralls_reborn (0.25.0)
+      simplecov (>= 0.18.1, < 0.22.0)
+      term-ansicolor (~> 1.6)
+      thor (>= 0.20.3, < 2.0)
+      tins (~> 1.16)
     diff-lcs (1.5.0)
+    docile (1.4.0)
+    domain_name (0.5.20190701)
+      unf (>= 0.0.5, < 1.0.0)
+    http-accept (1.7.0)
+    http-cookie (1.0.5)
+      domain_name (~> 0.5)
+    i18n (1.12.0)
+      concurrent-ruby (~> 1.0)
+    mechanize (2.8.5)
+      addressable (~> 2.8)
+      domain_name (~> 0.5, >= 0.5.20190701)
+      http-cookie (~> 1.0, >= 1.0.3)
+      mime-types (~> 3.0)
+      net-http-digest_auth (~> 1.4, >= 1.4.1)
+      net-http-persistent (>= 2.5.2, < 5.0.dev)
+      nokogiri (~> 1.11, >= 1.11.2)
+      rubyntlm (~> 0.6, >= 0.6.3)
+      webrick (~> 1.7)
+      webrobots (~> 0.1.2)
+    mime-types (3.4.1)
+      mime-types-data (~> 3.2015)
+    mime-types-data (3.2022.0105)
+    mini_portile2 (2.8.0)
+    minitest (5.16.3)
+    net-http-digest_auth (1.4.1)
+    net-http-persistent (4.0.1)
+      connection_pool (~> 2.2)
+    netrc (0.11.0)
+    nokogiri (1.13.8)
+      mini_portile2 (~> 2.8.0)
+      racc (~> 1.4)
+    public_suffix (5.0.0)
+    racc (1.6.0)
     rake (12.3.3)
+    rest-client (2.1.0)
+      http-accept (>= 1.7.0, < 2.0)
+      http-cookie (>= 1.0.2, < 2.0)
+      mime-types (>= 1.16, < 4.0)
+      netrc (~> 0.8)
     rspec (3.11.0)
       rspec-core (~> 3.11.0)
       rspec-expectations (~> 3.11.0)
@@ -21,14 +73,40 @@ GEM
       diff-lcs (>= 1.2.0, < 2.0)
       rspec-support (~> 3.11.0)
     rspec-support (3.11.1)
+    rubyntlm (0.6.3)
+    simplecov (0.21.2)
+      docile (~> 1.1)
+      simplecov-html (~> 0.11)
+      simplecov_json_formatter (~> 0.1)
+    simplecov-html (0.12.3)
+    simplecov_json_formatter (0.1.4)
+    sync (0.5.0)
+    term-ansicolor (1.7.1)
+      tins (~> 1.0)
+    thor (1.2.1)
+    tins (1.31.1)
+      sync
+    tzinfo (2.0.5)
+      concurrent-ruby (~> 1.0)
+    unf (0.1.4)
+      unf_ext
+    unf_ext (0.0.8.2)
+    webrick (1.7.0)
+    webrobots (0.1.2)
+    wombat (3.0.0)
+      activesupport
+      mechanize (~> 2.8.5)
+      rest-client
 
 PLATFORMS
   ruby
 
 DEPENDENCIES
+  coveralls_reborn
   gcrawler!
   rake (~> 12.0)
   rspec (~> 3.0)
+  wombat
 
 BUNDLED WITH
    2.1.4
data/README.md CHANGED
@@ -1,6 +1,9 @@
 # Gcrawler
 
-Google search crawler for Ruby version.
+[![Gem Version](https://badge.fury.io/rb/gcrawler.svg)](https://badge.fury.io/rb/gcrawler)
+[![Coverage Status](https://coveralls.io/repos/github/rogerluo410/gcrawler/badge.svg?branch=master)](https://coveralls.io/github/rogerluo410/gcrawler?branch=master)
+
+Google search crawler for Ruby version. Crawls each link's text and URL by keywords on Google.com.
 
 ## Installation
 
@@ -23,16 +26,29 @@ Or install it yourself as:
 ```ruby
 require 'gcrawler'
 
+# Set proxy servers; multiple IPs are much safer than a single IP.
 proxies = [
-  { ip: '127.0.0.1', port: '7890' }
+  { ip: '127.0.0.1', port: '7890' },
+  ...
 ]
 
+# Exclude these hosts from the result links.
 exclude_hosts = [
-  'accounts.google.com',
-  'support.google.com'
+  'accounts.google.com',
+  'support.google.com'
+]
+
+# Don't search on these blacklisted Google domains.
+black_domains = [
+  'www.google.at',
+  'www.google.bf'
 ]
 
-google_crawler = GoogleCrawler.new(proxies: proxies, exclude_hosts: exclude_hosts)
+google_crawler = GoogleCrawler.new(
+  proxies: proxies,
+  black_domains: black_domains,
+  exclude_hosts: exclude_hosts
+)
 
 # Output: Mechanize::Page, see https://github.com/sparklemotion/mechanize
 pp google_crawler.search_as_page('お肉とチーズの専門店', 'ミートダルマ札幌店')
@@ -43,9 +59,12 @@
 # Output: ['url1', 'url2', ...]
 pp google_crawler.search_as_url('お肉とチーズの専門店', 'ミートダルマ札幌店', country: 'ja')
 
+# Get the second page:
+pp google_crawler.search_as_url('お肉とチーズの専門店', 'ミートダルマ札幌店', country: 'ja', start: 10)
+
 ```
 
-Function args definition:
+Function input and output definitions:
 
 search_as_page:
   Args:
@@ -53,11 +72,44 @@ Function args definition:
     language (str, optional): Query language. Defaults to nil.
     num (uint, optional): Number of results per page(default is 10 per page). Defaults to nil.
     start (int, optional): Offset. Defaults to 0.
-    country (str, optional): Query country, Defaults to None, example: countryCN or cn or CN
+    country (str, optional): Query country. Defaults to nil, example: countryCN or cn or CN.
+    pause (uint, optional): Set crawling delay seconds between two requests.
+                            Too short a delay may be blocked by Google's crawling monitor.
+                            Defaults to 0.
 
   Return:
     Mechanize::Page, see https://github.com/sparklemotion/mechanize
 
+
+search_as_url:
+  Args:
+    keywords (varargs): kw1, kw2, kw3, ...
+    language (str, optional): Query language. Defaults to nil.
+    num (uint, optional): Number of results per page(default is 10 per page). Defaults to nil.
+    start (int, optional): Offset. Defaults to 0.
+    country (str, optional): Query country. Defaults to nil, example: countryCN or cn or CN.
+    pause (uint, optional): Set crawling delay seconds between two requests.
+                            Too short a delay may be blocked by Google's crawling monitor.
+                            Defaults to 0.
+
+  Return:
+    ['url1', 'url2', ...]
+
+
+search_as_object:
+  Args:
+    keywords (varargs): kw1, kw2, kw3, ...
+    language (str, optional): Query language. Defaults to nil.
+    num (uint, optional): Number of results per page(default is 10 per page). Defaults to nil.
+    start (int, optional): Offset. Defaults to 0.
+    country (str, optional): Query country. Defaults to nil, example: countryCN or cn or CN.
+    pause (uint, optional): Set crawling delay seconds between two requests.
+                            Too short a delay may be blocked by Google's crawling monitor.
+                            Defaults to 0.
+
+  Return:
+    [{text: xxx, url: xxx}, ...]
+
 
 ## Development
 
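The `pause` argument documented above replaces the randomized inter-request delay with a fixed one when it is non-zero. A short usage sketch (the keyword and the 5-second value are illustrative, reusing the `google_crawler` instance from the README example):

```ruby
# Wait a fixed 5 seconds between requests instead of the default random interval.
pp google_crawler.search_as_url('お肉とチーズの専門店', country: 'ja', pause: 5)

# pause: 0 (the default) keeps the crawler's randomized delay.
pp google_crawler.search_as_object('お肉とチーズの専門店')
```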
@@ -6,8 +6,11 @@
 
 require 'wombat'
 require 'uri'
+require 'logger'
 require_relative './utils'
 
+LOGGER = Logger.new(STDOUT)
+
 # Crawl action
 class Crawler
   include Wombat::Crawler
@@ -27,7 +30,7 @@ class Crawler
     mechanize.set_proxy(*proxy) if proxy.length == 2
     mechanize.user_agent = user_agent
 
-    pp "proxy: #{proxy}, user_agent: #{user_agent}"
+    LOGGER.info "proxy: #{proxy}, user_agent: #{user_agent}"
   end
 end
 
@@ -44,15 +47,15 @@ class GoogleCrawler
   end
 
   # search as url
-  def search_as_url(*keywords, language: nil, num: nil, country: nil, start: 0)
-    search_as_page(*keywords, language: language, num: num, country: country, start: start)
+  def search_as_url(*keywords, language: nil, num: nil, country: nil, start: 0, pause: 0)
+    search_as_page(*keywords, language: language, num: num, country: country, start: start, pause: pause)
 
     filter_urls
   end
 
   # search as object with keys {'text', 'url'}
-  def search_as_object(*keywords, language: nil, num: nil, country: nil, start: 0)
-    search_as_page(*keywords, language: language, num: num, country: country, start: start)
+  def search_as_object(*keywords, language: nil, num: nil, country: nil, start: 0, pause: 0)
+    search_as_page(*keywords, language: language, num: num, country: country, start: start, pause: pause)
 
     generate_objects
   end
@@ -63,12 +66,14 @@ class GoogleCrawler
   # language (str, optional): Query language. Defaults to nil.
   # num (uint, optional): Number of results per page(default is 10 per page). Defaults to nil.
   # start (int, optional): Offset. Defaults to 0.
-  # country (str, optional): Query country, Defaults to None, example: countryCN or cn or CN
+  # country (str, optional): Query country. Defaults to nil, example: countryCN or cn or CN.
+  # pause (uint, optional): Set crawling delay seconds between two requests.
+  #                         Too short a delay may be blocked by Google's crawling monitor. Defaults to 0.
   #
   # Return:
   #   Mechanize::Page, see https://github.com/sparklemotion/mechanize
   #
-  def search_as_page(*keywords, language: nil, num: nil, country: nil, start: 0)
+  def search_as_page(*keywords, language: nil, num: nil, country: nil, start: 0, pause: 0)
     return if keywords.empty?
 
     query_str = "q=#{keywords.join('+')}&btnG=Search&gbv=1&safe=active&start=0"
@@ -79,15 +84,15 @@ class GoogleCrawler
 
     @crawler.query_str(query_str)
 
-    seconds = Utils.random_interval_time
-    pp "Crawling query string is #{query_str}, will be crawling after #{seconds} seconds..."
+    seconds = pause.zero? ? Utils.random_interval_time : pause
+    LOGGER.info "Crawling query string is #{query_str}, will be crawling after #{seconds} seconds..."
     sleep(seconds)
 
     @crawler.crawl
 
    raise "Fetch on Google failed with code #{@crawler.response_code}" unless @crawler.response_code == 200
 
-    pp 'Crawl on Google successfully...'
+    LOGGER.info 'Crawl on Google successfully...'
   end
 
   private
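The behavioural core of the 0.1.2 change is the delay selection in `search_as_page`: a zero `pause` falls back to `Utils.random_interval_time`, anything positive is used as a fixed delay. A self-contained illustration of that ternary (`effective_delay` is a hypothetical helper written only for this sketch, with the random interval passed in as a plain value):

```ruby
# Mirrors `seconds = pause.zero? ? Utils.random_interval_time : pause` from the
# diff above, kept standalone so it can be run without the gem.
def effective_delay(pause, random_interval)
  pause.zero? ? random_interval : pause
end

effective_delay(0, 7) # => 7 (default: randomized interval between requests)
effective_delay(3, 7) # => 3 (explicit pause overrides the random delay)
```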
@@ -1,3 +1,3 @@
 module Gcrawler
-  VERSION = "0.1.0"
+  VERSION = '0.1.2'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: gcrawler
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.2
 platform: ruby
 authors:
 - rogerluo410
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2022-09-23 00:00:00.000000000 Z
+date: 2022-09-27 00:00:00.000000000 Z
 dependencies: []
 description: Crawling link text and link url by keywords on Google.com.
 email:
@@ -17,9 +17,11 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- ".github/workflows/ruby.yml"
 - ".gitignore"
 - ".rspec"
 - ".travis.yml"
+- CHANGELOG.md
 - CODE_OF_CONDUCT.md
 - Gemfile
 - Gemfile.lock