gcrawler 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.rspec +3 -0
- data/.travis.yml +6 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +7 -0
- data/Gemfile.lock +34 -0
- data/LICENSE.txt +21 -0
- data/README.md +82 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/examples/example.rb +21 -0
- data/gcrawler.gemspec +29 -0
- data/lib/gcrawler/data/domains.txt +193 -0
- data/lib/gcrawler/data/user_agents.txt +899 -0
- data/lib/gcrawler/search.rb +122 -0
- data/lib/gcrawler/utils.rb +37 -0
- data/lib/gcrawler/version.rb +3 -0
- data/lib/gcrawler.rb +2 -0
- metadata +66 -0
data/lib/gcrawler/search.rb ADDED

@@ -0,0 +1,122 @@

```ruby
# Crawl google result by keywords
#
# Dependences:
#   gem 'wombat'
#

require 'wombat'
require 'uri'
require_relative './utils'

# Crawl action
class Crawler
  include Wombat::Crawler

  def query_str(val)
    setting
    path "/search?#{val}"
  end

  base_url "https://#{Utils.random_domain}/"

  private

  def setting
    proxy = Utils.random_proxy
    user_agent = Utils.random_user_agent
    mechanize.set_proxy(*proxy) if proxy.length == 2
    mechanize.user_agent = user_agent

    pp "proxy: #{proxy}, user_agent: #{user_agent}"
  end
end

# Google crawler
class GoogleCrawler
  attr_accessor :exclude_hosts

  def initialize(proxies: [], black_domains: [], exclude_hosts: [])
    @exclude_hosts = exclude_hosts
    Utils.proxies = proxies
    Utils.black_domains = black_domains

    @crawler = Crawler.new
  end

  # search as url
  def search_as_url(*keywords, language: nil, num: nil, country: nil, start: 0)
    search_as_page(*keywords, language: language, num: num, country: country, start: start)

    filter_urls
  end

  # search as object with keys {'text', 'url'}
  def search_as_object(*keywords, language: nil, num: nil, country: nil, start: 0)
    search_as_page(*keywords, language: language, num: num, country: country, start: start)

    generate_objects
  end

  # search as page
  # Args:
  #   keywords (varargs): kw1, kw2, kw3, ...
  #   language (str, optional): Query language. Defaults to nil.
  #   num (uint, optional): Number of results per page(default is 10 per page). Defaults to nil.
  #   start (int, optional): Offset. Defaults to 0.
  #   country (str, optional): Query country, Defaults to None, example: countryCN or cn or CN
  #
  # Return:
  #   Mechanize::Page, see https://github.com/sparklemotion/mechanize
  #
  def search_as_page(*keywords, language: nil, num: nil, country: nil, start: 0)
    return if keywords.empty?

    query_str = "q=#{keywords.join('+')}&btnG=Search&gbv=1&safe=active&start=0"
    query_str += "&ln=#{language}" unless language.blank?
    query_str += "&num=#{num.to_i}" unless num.blank?
    query_str += "&cr=#{country}" unless country.blank?
    query_str.gsub!(/start=0/, "start=#{start}") unless start == 0

    @crawler.query_str(query_str)

    seconds = Utils.random_interval_time
    pp "Crawling query string is #{query_str}, will be crawling after #{seconds} seconds..."
    sleep(seconds)

    @crawler.crawl

    raise "Fetch on Google failed with code #{@crawler.response_code}" unless @crawler.response_code == 200

    pp 'Crawl on Google successfully...'
  end

  private

  def filter_urls
    urls = @crawler.page&.links&.map do |link_node|
      uri_str = link_node.uri.to_s

      if uri_str.start_with?(%r{/url\?q=}) && !@exclude_hosts.include?(URI.parse(uri_str.split(%r{/url\?q=})[1])&.host)
        real_uri = uri_str.split(%r{/url\?q=})[1]
      end
      real_uri
    end

    urls.compact
  end

  def generate_objects
    objects = @crawler.page&.links&.map do |link_node|
      uri_str = link_node.uri.to_s

      if uri_str.start_with?(%r{/url\?q=}) && !@exclude_hosts.include?(URI.parse(uri_str.split(%r{/url\?q=})[1])&.host)
        node = { text: link_node.text,
                 url: uri_str.split(%r{/url\?q=})[1] }
      end

      node
    end

    objects.compact
  end
end
```
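For orientation, a minimal usage sketch of the interface defined above. It is not taken from the package (examples/example.rb is not shown in this diff): the keyword options mirror search_as_page's signature, the :ip/:port proxy keys match Utils.random_proxy, all values are placeholders, and it assumes lib/gcrawler.rb loads the search module.

```ruby
require 'gcrawler'

# Hypothetical proxy list; Utils.random_proxy expects hashes with :ip and :port.
proxies = [{ ip: '127.0.0.1', port: 8118 }]

crawler = GoogleCrawler.new(
  proxies: proxies,
  black_domains: ['google.ru'],            # placeholder: domains skipped when picking a search mirror
  exclude_hosts: ['accounts.google.com']   # placeholder: result hosts dropped from the output
)

# Array of result URLs extracted from the "/url?q=" links.
urls = crawler.search_as_url('ruby', 'crawler', language: 'en', num: 10)

# Array of { text:, url: } hashes for the same links.
objects = crawler.search_as_object('ruby', 'crawler', start: 10)
```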
data/lib/gcrawler/utils.rb ADDED

@@ -0,0 +1,37 @@

```ruby
module Utils
  @domains = File.readlines(File.join(File.dirname(__FILE__), './data/domains.txt'))
  @user_agents = File.readlines(File.join(File.dirname(__FILE__), './data/user_agents.txt'))
  @proxies = []
  @black_domains = []

  class << self
    attr_reader :proxies, :black_domains

    def proxies=(proxies)
      @proxies = proxies unless proxies.empty?
    end

    def black_domains=(black_domains)
      @black_domains = black_domains unless black_domains.empty?
    end

    def random_domain
      (@domains - @black_domains).sample.gsub(/[\r\n]+/, '')
    end

    def random_user_agent
      @user_agents.sample.gsub(/[\r\n]+/, '')
    end

    def random_interval_time
      (20..59).to_a.sample
    end

    def random_proxy
      proxy = @proxies.sample
      return [proxy[:ip], proxy[:port].to_i] unless proxy.nil?

      []
    end
  end
end
```
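A short illustration of how the Utils module above behaves (addresses and ports are hypothetical):

```ruby
# With no proxies configured, random_proxy returns an empty array,
# so Crawler#setting skips mechanize.set_proxy.
Utils.random_proxy             # => []

# Assigning an empty list is ignored; the current value is kept.
Utils.proxies = []

# Proxy entries are hashes with :ip and :port keys.
Utils.proxies = [{ ip: '10.0.0.2', port: 3128 }]
Utils.random_proxy             # => ['10.0.0.2', 3128]

# Random delay of 20..59 seconds, used before each crawl.
Utils.random_interval_time     # => 42, for example
```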
data/lib/gcrawler.rb ADDED

metadata ADDED
@@ -0,0 +1,66 @@

```yaml
--- !ruby/object:Gem::Specification
name: gcrawler
version: !ruby/object:Gem::Version
  version: 0.1.0
platform: ruby
authors:
- rogerluo410
autorequire:
bindir: exe
cert_chain: []
date: 2022-09-23 00:00:00.000000000 Z
dependencies: []
description: Crawling link text and link url by keywords on Google.com.
email:
- rogerluo410@gmail.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- ".gitignore"
- ".rspec"
- ".travis.yml"
- CODE_OF_CONDUCT.md
- Gemfile
- Gemfile.lock
- LICENSE.txt
- README.md
- Rakefile
- bin/console
- bin/setup
- examples/example.rb
- gcrawler.gemspec
- lib/gcrawler.rb
- lib/gcrawler/data/domains.txt
- lib/gcrawler/data/user_agents.txt
- lib/gcrawler/search.rb
- lib/gcrawler/utils.rb
- lib/gcrawler/version.rb
homepage: https://github.com/rogerluo410/gcrawler
licenses:
- MIT
metadata:
  allowed_push_host: https://rubygems.org/
  homepage_uri: https://github.com/rogerluo410/gcrawler
  source_code_uri: https://github.com/rogerluo410/gcrawler
  changelog_uri: https://github.com/rogerluo410/gcrawler/CHANGELOG.md
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: 2.3.0
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubygems_version: 3.1.6
signing_key:
specification_version: 4
summary: Google search crawler for Ruby version.
test_files: []
```
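To depend on exactly this release, the usual Gemfile entry would be:

```ruby
# Gemfile
gem 'gcrawler', '0.1.0'
```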