gcrawler 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/gcrawler/search.rb ADDED
@@ -0,0 +1,122 @@
+ # Crawl Google results by keywords
+ #
+ # Dependencies:
+ #   gem 'wombat'
+ #
+
+ require 'wombat'
+ require 'uri'
+ require_relative './utils'
+
+ # Crawl action: fetches a Google search page via Wombat using a random domain, proxy and user agent
+ class Crawler
+   include Wombat::Crawler
+
+   def query_str(val)
+     setting
+     path "/search?#{val}"
+   end
+
+   base_url "https://#{Utils.random_domain}/"
+
+   private
+
+   def setting
+     proxy = Utils.random_proxy
+     user_agent = Utils.random_user_agent
+     mechanize.set_proxy(*proxy) if proxy.length == 2
+     mechanize.user_agent = user_agent
+
+     pp "proxy: #{proxy}, user_agent: #{user_agent}"
+   end
+ end
+
+ # Google search crawler
+ class GoogleCrawler
+   attr_accessor :exclude_hosts
+
+   def initialize(proxies: [], black_domains: [], exclude_hosts: [])
+     @exclude_hosts = exclude_hosts
+     Utils.proxies = proxies
+     Utils.black_domains = black_domains
+
+     @crawler = Crawler.new
+   end
+
+   # Search and return the result URLs
+   def search_as_url(*keywords, language: nil, num: nil, country: nil, start: 0)
+     search_as_page(*keywords, language: language, num: num, country: country, start: start)
+
+     filter_urls
+   end
+
+   # Search and return result objects with keys :text and :url
+   def search_as_object(*keywords, language: nil, num: nil, country: nil, start: 0)
+     search_as_page(*keywords, language: language, num: num, country: country, start: start)
+
+     generate_objects
+   end
+
+   # Search and fetch the result page
+   # Args:
+   #   keywords (varargs): kw1, kw2, kw3, ...
+   #   language (str, optional): Query language. Defaults to nil.
+   #   num (uint, optional): Number of results per page (Google's default is 10). Defaults to nil.
+   #   start (int, optional): Offset. Defaults to 0.
+   #   country (str, optional): Query country. Defaults to nil. Example: countryCN, cn or CN.
+   #
+   # Return:
+   #   Mechanize::Page, see https://github.com/sparklemotion/mechanize
+   #
+   def search_as_page(*keywords, language: nil, num: nil, country: nil, start: 0)
+     return if keywords.empty?
+
+     query_str = "q=#{keywords.join('+')}&btnG=Search&gbv=1&safe=active&start=0"
+     query_str += "&ln=#{language}" unless language.blank?
+     query_str += "&num=#{num.to_i}" unless num.blank?
+     query_str += "&cr=#{country}" unless country.blank?
+     query_str.gsub!(/start=0/, "start=#{start}") unless start == 0
+
+     @crawler.query_str(query_str)
+
+     seconds = Utils.random_interval_time
+     pp "Query string is #{query_str}, crawling in #{seconds} seconds..."
+     sleep(seconds)
+
+     @crawler.crawl
+
+     raise "Fetch on Google failed with code #{@crawler.response_code}" unless @crawler.response_code == 200
+
+     pp 'Crawled Google successfully...'
+   end
+
+   private
+
+   def filter_urls
+     urls = @crawler.page&.links&.map do |link_node|
+       uri_str = link_node.uri.to_s
+
+       if uri_str.start_with?(%r{/url\?q=}) && !@exclude_hosts.include?(URI.parse(uri_str.split(%r{/url\?q=})[1])&.host)
+         real_uri = uri_str.split(%r{/url\?q=})[1]
+       end
+       real_uri
+     end
+
+     urls.compact
+   end
+
+   def generate_objects
+     objects = @crawler.page&.links&.map do |link_node|
+       uri_str = link_node.uri.to_s
+
+       if uri_str.start_with?(%r{/url\?q=}) && !@exclude_hosts.include?(URI.parse(uri_str.split(%r{/url\?q=})[1])&.host)
+         node = { text: link_node.text,
+                  url: uri_str.split(%r{/url\?q=})[1] }
+       end
+
+       node
+     end
+
+     objects.compact
+   end
+ end
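
Taken together, the two classes above form a small keyword-search API. A minimal usage sketch, assuming the gem and its wombat dependency are installed; the proxy and host values below are made-up placeholders:

  require 'gcrawler'

  # Hypothetical proxies; each entry must expose :ip and :port (see Utils.random_proxy).
  proxies = [{ ip: '127.0.0.1', port: 8118 }]

  crawler = GoogleCrawler.new(
    proxies: proxies,
    black_domains: ['www.google.ru'],       # search domains to avoid
    exclude_hosts: ['accounts.google.com']  # result hosts to drop
  )

  # Array of result URLs extracted from the /url?q= links on the result page.
  urls = crawler.search_as_url('ruby', 'crawler', num: 20)

  # Array of hashes shaped like { text: '...', url: '...' }.
  objects = crawler.search_as_object('ruby', 'crawler', language: 'en', start: 10)

Keywords are joined with '+' into the q= parameter, and each call sleeps a random 20-59 seconds before crawling.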
data/lib/gcrawler/utils.rb ADDED
@@ -0,0 +1,37 @@
+ module Utils
+   @domains = File.readlines(File.join(File.dirname(__FILE__), './data/domains.txt'))
+   @user_agents = File.readlines(File.join(File.dirname(__FILE__), './data/user_agents.txt'))
+   @proxies = []
+   @black_domains = []
+
+   class << self
+     attr_reader :proxies, :black_domains
+
+     def proxies=(proxies)
+       @proxies = proxies unless proxies.empty?
+     end
+
+     def black_domains=(black_domains)
+       @black_domains = black_domains unless black_domains.empty?
+     end
+
+     def random_domain
+       (@domains - @black_domains).sample.gsub(/[\r\n]+/, '')
+     end
+
+     def random_user_agent
+       @user_agents.sample.gsub(/[\r\n]+/, '')
+     end
+
+     def random_interval_time
+       (20..59).to_a.sample
+     end
+
+     def random_proxy
+       proxy = @proxies.sample
+       return [proxy[:ip], proxy[:port].to_i] unless proxy.nil?
+
+       []
+     end
+   end
+ end
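
A quick sketch of how these helpers behave, with made-up values; the data files ship with the gem as lib/gcrawler/data/domains.txt and user_agents.txt, read one entry per line:

  Utils.proxies = [{ ip: '10.0.0.5', port: '3128' }]  # setter ignores an empty array
  Utils.black_domains = ['www.google.de']             # subtracted from the domain list

  Utils.random_proxy          # => ['10.0.0.5', 3128]; [] when no proxies are configured
  Utils.random_domain         # => e.g. 'www.google.com', with trailing newlines stripped
  Utils.random_user_agent     # => one line from user_agents.txt
  Utils.random_interval_time  # => an Integer between 20 and 59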
data/lib/gcrawler/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Gcrawler
+   VERSION = "0.1.0"
+ end
data/lib/gcrawler.rb ADDED
@@ -0,0 +1,2 @@
+ require_relative 'gcrawler/version'
+ require_relative 'gcrawler/search'
metadata ADDED
@@ -0,0 +1,66 @@
+ --- !ruby/object:Gem::Specification
+ name: gcrawler
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - rogerluo410
+ autorequire:
+ bindir: exe
+ cert_chain: []
+ date: 2022-09-23 00:00:00.000000000 Z
+ dependencies: []
+ description: Crawling link text and link url by keywords on Google.com.
+ email:
+ - rogerluo410@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".gitignore"
+ - ".rspec"
+ - ".travis.yml"
+ - CODE_OF_CONDUCT.md
+ - Gemfile
+ - Gemfile.lock
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - bin/console
+ - bin/setup
+ - examples/example.rb
+ - gcrawler.gemspec
+ - lib/gcrawler.rb
+ - lib/gcrawler/data/domains.txt
+ - lib/gcrawler/data/user_agents.txt
+ - lib/gcrawler/search.rb
+ - lib/gcrawler/utils.rb
+ - lib/gcrawler/version.rb
+ homepage: https://github.com/rogerluo410/gcrawler
+ licenses:
+ - MIT
+ metadata:
+   allowed_push_host: https://rubygems.org/
+   homepage_uri: https://github.com/rogerluo410/gcrawler
+   source_code_uri: https://github.com/rogerluo410/gcrawler
+   changelog_uri: https://github.com/rogerluo410/gcrawler/CHANGELOG.md
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: 2.3.0
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.1.6
+ signing_key:
+ specification_version: 4
+ summary: Google search crawler for Ruby version.
+ test_files: []
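
Note that the metadata declares no runtime dependencies (dependencies: []), while lib/gcrawler/search.rb requires wombat, so a consumer has to add that gem themselves. A plausible setup, assuming a standard Bundler project:

  # Gemfile
  gem 'wombat'              # not declared in the gemspec, but required by gcrawler/search
  gem 'gcrawler', '0.1.0'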