gcrawler 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0e95b1de0b3e42ef12b8b757dc7298bddc167cd7ffc2b1dd9ad50b8bf39480e9
4
- data.tar.gz: edc2c49da4450842be6f347cf407d90cc3e3b69f00473f58352deb1361f273bb
3
+ metadata.gz: b6e556eb21077d9c958169dcc23e580d82fb0d350bd6262fb3e0b2765d83f31c
4
+ data.tar.gz: 17ebf8852c1a5543a85ee29a45db2bc4eb87c1ea8ed12f3d56c190ea10107703
5
5
  SHA512:
6
- metadata.gz: 0c6551e3e9082d57fb8ed474dd6c5ce0ca26daaa289c041f622a5acae457673b885bfa15d5d6cf81b5e1c58176ac886226c3ff1dfcc03c6877a57a557ae7023e
7
- data.tar.gz: 52bd81434ae9974d9eb70f09a36f5716cf92f6d05c7ea5e70f71d7503d5f0ae37f84a38b9f3a44d29a0052bad5b344699f8c41ef0840ed808b50acb97fbe1f58
6
+ metadata.gz: 63083b51697eb73760ddb29a97ce7770f5404263751f6213471955e571c1e2a800fe92d2d6bc79c1d857c085eae1eecfe8cd82baa6864b3526e5b3581d9009ef
7
+ data.tar.gz: 0b0a35efabdf3f69a8f6f288586561b7124d2b2440b405840aff0a57ac78c833c9ca37129443eb7a92b37ab43d9e2508105b2522a93872f6c6c28884d3ea4d93
data/CHANGELOG.md CHANGED
@@ -1,4 +1,9 @@
1
+ ### version 0.1.2 (2022-09-27)
2
+
3
+ * Add pause argument for search functions
4
+
1
5
  ### version 0.1.1 (2022-09-24)
2
6
 
3
7
  * Add logger class for stdout
4
- * Add test specs
8
+ * Add test specs
9
+
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- gcrawler (0.1.1)
4
+ gcrawler (0.1.2)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
data/README.md CHANGED
@@ -26,16 +26,29 @@ Or install it yourself as:
26
26
  ```ruby
27
27
  require 'gcrawler'
28
28
 
29
+ # Set proxy servers; using multiple IPs is much safer than a single IP.
29
30
  proxies = [
30
- { ip: '127.0.0.1', port: '7890' }
31
+ { ip: '127.0.0.1', port: '7890' },
32
+ ...
31
33
  ]
32
34
 
35
+ # Exclude links to these hosts from the results.
33
36
  exclude_hosts = [
34
- 'accounts.google.com',
35
- 'support.google.com'
37
+ 'accounts.google.com',
38
+ 'support.google.com'
36
39
  ]
37
40
 
38
- google_crawler = GoogleCrawler.new(proxies: proxies, exclude_hosts: exclude_hosts)
41
+ # Domains in this blacklist are not used for searching.
42
+ black_domains = [
43
+ 'www.google.at',
44
+ 'www.google.bf'
45
+ ]
46
+
47
+ google_crawler = GoogleCrawler.new(
48
+ proxies: proxies,
49
+ black_domains: black_domains,
50
+ exclude_hosts: exclude_hosts
51
+ )
39
52
 
40
53
  # Output: Mechanize::Page, see https://github.com/sparklemotion/mechanize
41
54
  pp google_crawler.search_as_page('お肉とチーズの専門店', 'ミートダルマ札幌店')
@@ -51,7 +64,7 @@ Or install it yourself as:
51
64
 
52
65
  ```
53
66
 
54
- Function args definition:
67
+ Function input and output definitions:
55
68
 
56
69
  search_as_page:
57
70
  Args:
@@ -59,11 +72,44 @@ Function args definition:
59
72
  language (str, optional): Query language. Defaults to nil.
60
73
  num (uint, optional): Number of results per page(default is 10 per page). Defaults to nil.
61
74
  start (int, optional): Offset. Defaults to 0.
62
- country (str, optional): Query country, Defaults to None, example: countryCN or cn or CN
75
+ country (str, optional): Query country. Defaults to nil. Example: countryCN, cn, or CN.
76
+ pause (uint, optional): Delay in seconds between two requests.
77
+ Too short a delay may get requests blocked by Google's crawl monitoring.
78
+ Defaults to 0, in which case a random interval is used.
63
79
 
64
80
  Return:
65
81
  Mechanize::Page, see https://github.com/sparklemotion/mechanize
66
82
 
83
+
84
+ search_as_url:
85
+ Args:
86
+ keywords (varargs): kw1, kw2, kw3, ...
87
+ language (str, optional): Query language. Defaults to nil.
88
+ num (uint, optional): Number of results per page(default is 10 per page). Defaults to nil.
89
+ start (int, optional): Offset. Defaults to 0.
90
+ country (str, optional): Query country. Defaults to nil. Example: countryCN, cn, or CN.
91
+ pause (uint, optional): Delay in seconds between two requests.
92
+ Too short a delay may get requests blocked by Google's crawl monitoring.
93
+ Defaults to 0, in which case a random interval is used.
94
+
95
+ Return:
96
+ ['url1', 'url2', ...]
97
+
98
+
99
+ search_as_object:
100
+ Args:
101
+ keywords (varargs): kw1, kw2, kw3, ...
102
+ language (str, optional): Query language. Defaults to nil.
103
+ num (uint, optional): Number of results per page(default is 10 per page). Defaults to nil.
104
+ start (int, optional): Offset. Defaults to 0.
105
+ country (str, optional): Query country. Defaults to nil. Example: countryCN, cn, or CN.
106
+ pause (uint, optional): Delay in seconds between two requests.
107
+ Too short a delay may get requests blocked by Google's crawl monitoring.
108
+ Defaults to 0, in which case a random interval is used.
109
+
110
+ Return:
111
+ [{text: xxx, url: xxx}, ...]
112
+
67
113
 
68
114
  ## Development
69
115
 
@@ -47,15 +47,15 @@ class GoogleCrawler
47
47
  end
48
48
 
49
49
  # search as url
50
- def search_as_url(*keywords, language: nil, num: nil, country: nil, start: 0)
51
- search_as_page(*keywords, language: language, num: num, country: country, start: start)
50
+ def search_as_url(*keywords, language: nil, num: nil, country: nil, start: 0, pause: 0)
51
+ search_as_page(*keywords, language: language, num: num, country: country, start: start, pause: pause)
52
52
 
53
53
  filter_urls
54
54
  end
55
55
 
56
56
  # search as object with keys {'text', 'url'}
57
- def search_as_object(*keywords, language: nil, num: nil, country: nil, start: 0)
58
- search_as_page(*keywords, language: language, num: num, country: country, start: start)
57
+ def search_as_object(*keywords, language: nil, num: nil, country: nil, start: 0, pause: 0)
58
+ search_as_page(*keywords, language: language, num: num, country: country, start: start, pause: pause)
59
59
 
60
60
  generate_objects
61
61
  end
@@ -66,12 +66,14 @@ class GoogleCrawler
66
66
  # language (str, optional): Query language. Defaults to nil.
67
67
  # num (uint, optional): Number of results per page(default is 10 per page). Defaults to nil.
68
68
  # start (int, optional): Offset. Defaults to 0.
69
- # country (str, optional): Query country, Defaults to None, example: countryCN or cn or CN
69
+ # country (str, optional): Query country. Defaults to nil. Example: countryCN, cn, or CN.
70
+ # pause (uint, optional): Delay in seconds between two requests.
71
+ # Too short a delay may get requests blocked by Google's crawl monitoring. Defaults to 0 (a random interval is used).
70
72
  #
71
73
  # Return:
72
74
  # Mechanize::Page, see https://github.com/sparklemotion/mechanize
73
75
  #
74
- def search_as_page(*keywords, language: nil, num: nil, country: nil, start: 0)
76
+ def search_as_page(*keywords, language: nil, num: nil, country: nil, start: 0, pause: 0)
75
77
  return if keywords.empty?
76
78
 
77
79
  query_str = "q=#{keywords.join('+')}&btnG=Search&gbv=1&safe=active&start=0"
@@ -82,7 +84,7 @@ class GoogleCrawler
82
84
 
83
85
  @crawler.query_str(query_str)
84
86
 
85
- seconds = Utils.random_interval_time
87
+ seconds = pause.zero? ? Utils.random_interval_time : pause
86
88
  LOGGER.info "Crawling query string is #{query_str}, will be crawling after #{seconds} seconds..."
87
89
  sleep(seconds)
88
90
 
@@ -1,3 +1,3 @@
1
1
  module Gcrawler
2
- VERSION = '0.1.1'
2
+ VERSION = '0.1.2'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gcrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - rogerluo410
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-09-25 00:00:00.000000000 Z
11
+ date: 2022-09-27 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Crawling link text and link url by keywords on Google.com.
14
14
  email: