bots 1.0.1 → 1.0.3

Files changed (5)
  1. checksums.yaml +4 -4
  2. data/lib/base.rb +13 -5
  3. data/lib/bots.rb +5 -0
  4. data/lib/scraper.rb +145 -0
  5. metadata +83 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: be4c3337063af2d514fd1016c94d66385fa84dbde27dbfa5174d5b0908c9efb1
-   data.tar.gz: 13ef3dccd336269fdb1960f243c8b7dd08abf782ba89cfbf935c4e802a29ca45
+   metadata.gz: 3a8018e0d8575a415699c41dcba236e3c4f400e8132111093e421ac02e792548
+   data.tar.gz: 4b876044081e94743d1b719c53424331d44bb200a8bdcfddd7d78562209eeed3
  SHA512:
-   metadata.gz: 354704a255a80def04f1993d3a73d5b358fedfeeda387278f47ab995ed10d2afced94fe175a15bc8dd7c55b1209798598b55a5f2d2071424f11613091cd87813
-   data.tar.gz: c1fae508d2b55269f0039524ee4e87714d7cc7244922faa83ac5b6c374abc29cdaab34355059ff92e5d6a0469be44900ac628fcfe55da80def3993549a2bcf27
+   metadata.gz: 326e82a582132f2d267e906df73aad0d812f9fc4fe00c8af2ba9ef6cd93a174ae2004719ec76f5cc6b018da0eda1b2cd891dd51f7df7adaad6044da773f207ec
+   data.tar.gz: 2a4944c21854faee39f81b81004fb63baf3f270b2374f081b98ca25b4f7695af80184b1449dff38207d4c21b7f34b524d59c997931f2db2743df42c56a714875
data/lib/base.rb CHANGED
@@ -9,11 +9,13 @@ module BlackStack
 
  def initialize(h)
  # array of numbers from 4000 to 4249
- unless h[:proxy].nil?
- self.ip = h[:proxy][:ip]
- self.user = h[:proxy][:user]
- self.password = h[:proxy][:password]
- self.ports = (h[:proxy][:port_from]..h[:proxy][:port_to]).to_a
+ if h
+ self.ip = h[:ip]
+ self.user = h[:user]
+ self.password = h[:password]
+ self.ports = (h[:port_from]..h[:port_to]).to_a
+ else
+ self.ports = []
  end
  self.port_index = -1
  end # initialize
@@ -26,10 +28,16 @@ module BlackStack
 
  class MechanizeBot < BlackStack::Bots::Bot
  attr_accessor :agent # mechanize agent
+ def initialize(h)
+ super(h)
+ end
  end # MechanizeBot
 
  class SeleniumBot < BlackStack::Bots::Bot
  attr_accessor :driver # selenium driver
+ def initialize(h)
+ super(h)
+ end
  end # MechanizeBot
 
  end # Bots
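
Note on the base.rb change: the proxy settings are no longer nested under a :proxy key; the constructor now reads them from the top level of the hash, and passing nil leaves the bot without a proxy (empty ports list). A minimal sketch of the new calling convention, based only on the keys read above (host and credential values are placeholders):

# 1.0.1 shape: proxy settings nested under :proxy
bot = BlackStack::Bots::MechanizeBot.new(
  proxy: { ip: '1.2.3.4', user: 'u', password: 'p', port_from: 4000, port_to: 4249 }
)

# 1.0.3 shape: proxy settings at the top level of the hash
bot = BlackStack::Bots::MechanizeBot.new(
  ip: '1.2.3.4', user: 'u', password: 'p', port_from: 4000, port_to: 4249
)

# no proxy at all: pass nil and the bot keeps an empty ports list
bot = BlackStack::Bots::MechanizeBot.new(nil)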
data/lib/bots.rb CHANGED
@@ -1,9 +1,14 @@
+ require 'open-uri'
  require 'mechanize'
  require 'selenium-webdriver'
  require 'simple_cloud_logging'
  require 'colorize'
  require 'csv'
+ require 'pry'
+ require 'sitemap-parser'
+ require 'timeout'
 
  require_relative './base'
  require_relative './google'
+ require_relative './scraper'
  require_relative './indeed'
data/lib/scraper.rb ADDED
@@ -0,0 +1,145 @@
+ module BlackStack
+ module Bots
+ class Scraper < BlackStack::Bots::MechanizeBot
+ attr_accessor :domain, :links
+ # auxiliar array of links that I have extracted links from
+ attr_accessor :links_processed
+
+ def initialize(init_domain, h)
+ super(h)
+ self.domain = init_domain
+ #self.agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+ self.links = []
+ self.links_processed = []
+ end # def initialize
+
+ def get(url)
+ # initialize mechanize agent
+ self.agent = Mechanize.new
+ # set a proxy with user and password
+ self.port_index += 1
+ self.port_index = 0 if self.port_index >= self.ports.length
+ self.agent.set_proxy(self.ip, self.ports[self.port_index], self.user, self.password) if self.proxy?
+ self.agent.open_timeout = 5
+ self.agent.read_timeout = 5
+ # return
+ return Timeout::timeout(5) { self.agent.get(url) }
+ end
+
+ def get_links_from_sitemap(l=nil)
+ i = 0
+ l.logs "Scrape sitemaps... "
+ begin
+ # download the robots.txt
+ url = "http://#{domain}/robots.txt"
+ # get the content of robots.txt from url
+ s = Timeout::timeout(5) { URI.open(url).read }
+ # get the sitemap
+ sitemaps = s.split("\n").select { |line| line =~ /^sitemap:/i }.map { |a| a.downcase.split('sitemap:').last.strip }.uniq
+ sitemaps.each { |b|
+ parser = Timeout::timeout(5) { SitemapParser.new b }
+ self.links += Timeout::timeout(5) { parser.to_a }
+ self.links.uniq!
+ }
+ l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
+ rescue => e
+ l.logf "Error: #{e.message.split("\n").first[0..100]})".red # get_links
+ end
+ end
+
+ # internal use only
+ def get_links_from_url(url, l=nil)
+ l = BlackStack::DummyLogger.new(nil) if l.nil?
+ l.logs "get_links (#{url})... "
+ begin
+ aux = []
+ # trim url
+ url = url.strip
+ # get domain of the url using open-uri
+ domain = URI.parse(url).host
+ # visit the main page of the website
+ page = self.get(url)
+ # get the self.links to the pages of the website
+ aux = page.links.map(&:href)
+ # remove non-string elements
+ aux = aux.select { |link| link.is_a?(String) }
+ # remove # from the self.links
+ aux = aux.map { |link| !link.nil? && link.split('#').first }
+ # remove querystring from the self.links
+ aux = aux.map { |link| !link.nil? && link.split('?').first }
+ # remove the self.links that are not http:// or https://
+ aux = aux.select { |link| !link.nil? && link =~ /^https?:\/\// }
+ # remove the self.links that are not from the same domain
+ aux = aux.select { |link| !link.nil? && link =~ /#{domain}/ }
+ # remove nil values
+ aux = aux.compact
+ # remove duplications
+ aux = aux.uniq
+ # filter links who already are in the list
+ a = aux.size
+ aux = aux.select { |link| !self.links.include?(link) }
+ b = aux.size
+ # add new links to self.links
+ self.links += aux
+ l.logf "done".green + " (#{a} links found, #{b} new, #{self.links.size} total)" # get_links
+ rescue => e
+ l.logf "Error: #{e.message.split("\n").first[0..100]})".red # get_links
+ end
+ end # def get_links_from_url
+
+ def get_links(stop_at=10, l=nil)
+ l = BlackStack::DummyLogger.new(nil) if l.nil?
+ # working with root url
+ url = "http://#{self.domain}/"
+ self.links << url if self.links.select { |link| link == url }.empty?
+ # iterate until I have discovered all the links
+ while self.links.size != self.links_processed.size && stop_at >= self.links.size
+ # iterate the links who are not in links_processed
+ self.links.select { |link| !self.links_processed.include?(link) }.each { |link|
+ # get the links from the url
+ self.get_links_from_url(link, l)
+ # add the link to the list of processed links
+ self.links_processed << link
+ }
+ end # while
+ # get links from the sitemap
+ self.get_links_from_sitemap(l)
+ end # def get_links
+
+ def find_keywords(a, stop_at=50, l=nil)
+ ret = []
+ l = BlackStack::DummyLogger.new(nil) if l.nil?
+ # iterate the links
+ j = 0
+ self.links.reject { |link| link =~ /\.pdf$/i || link =~ /\.jpg$/i || link =~ /\.jpeg$/i || link =~ /\.gif$/i }.each { |link|
+ j += 1
+ break if j > stop_at
+ l.logs "#{j.to_s}. find_keywords (#{link})... "
+ begin
+ # get the page
+ page = self.get(link)
+ # get page body content in plain text
+ s = Timeout::timeout(5) { Nokogiri::HTML(page.body).text }
+ # iterate the keywords
+ i = 0
+ a.each { |k|
+ # find the keyword
+ if s =~ /#{Regexp.escape(k)}/i
+ i += 1
+ ret << link if ret.select { |link| link == link }.empty?
+ break
+ end # if
+ } # each
+ break if ret.size > 0
+ l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
+ rescue => e
+ l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
+ end # begin
+ } # each
+ # return
+ ret
+ end
+
+ end # class Scraper
+ end # module Bots
+ end # module BlackStack
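
Note on the new Scraper class: a typical session might look like the sketch below ('example.com' and the keyword list are placeholders; the second constructor argument is the same proxy hash used in base.rb and may be nil):

require 'bots'

scraper = BlackStack::Bots::Scraper.new('example.com', nil)  # no proxy
scraper.get_links(10)                                        # crawl internal links (up to ~10), then the sitemaps
hits = scraper.find_keywords(['ruby', 'scraping'])           # links whose body matches any keyword
puts hits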
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: bots
  version: !ruby/object:Gem::Version
- version: 1.0.1
+ version: 1.0.3
  platform: ruby
  authors:
  - Leandro Daniel Sardi
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2023-08-08 00:00:00.000000000 Z
+ date: 2023-08-16 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: simple_cloud_logging
@@ -110,6 +110,86 @@ dependencies:
  - - ">="
  - !ruby/object:Gem::Version
  version: 0.8.1
+ - !ruby/object:Gem::Dependency
+ name: pry
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: 0.14.2
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: 0.14.2
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: 0.14.2
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: 0.14.2
+ - !ruby/object:Gem::Dependency
+ name: open-uri
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: 0.2.0
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: 0.2.0
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: 0.2.0
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: 0.2.0
+ - !ruby/object:Gem::Dependency
+ name: sitemap-parser
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: 0.5.6
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: 0.5.6
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: 0.5.6
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: 0.5.6
+ - !ruby/object:Gem::Dependency
+ name: timeout
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: 0.4.0
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: 0.4.0
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: 0.4.0
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: 0.4.0
  description: Ruby gem for scraping information from the public web.
  email: leandro@connectionsphere.com
  executables: []
@@ -120,6 +200,7 @@ files:
  - lib/bots.rb
  - lib/google.rb
  - lib/indeed.rb
+ - lib/scraper.rb
  homepage: https://rubygems.org/gems/bots
  licenses:
  - MIT
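
Note on the new runtime dependencies: the four gems added to the gemspec roughly correspond to the Gemfile entries sketched below (the gemspec above is the authoritative source):

gem 'pry',            '~> 0.14.2', '>= 0.14.2'
gem 'open-uri',       '~> 0.2.0',  '>= 0.2.0'
gem 'sitemap-parser', '~> 0.5.6',  '>= 0.5.6'
gem 'timeout',        '~> 0.4.0',  '>= 0.4.0'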