bots 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (5)
  1. checksums.yaml +4 -4
  2. data/lib/base.rb +13 -5
  3. data/lib/bots.rb +5 -0
  4. data/lib/scraper.rb +143 -0
  5. metadata +83 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: be4c3337063af2d514fd1016c94d66385fa84dbde27dbfa5174d5b0908c9efb1
-  data.tar.gz: 13ef3dccd336269fdb1960f243c8b7dd08abf782ba89cfbf935c4e802a29ca45
+  metadata.gz: 5d048bca926b971212391dd9405a1d82d1f3afa563afcbe0db65af2c754b42df
+  data.tar.gz: 2eb8ecbc2e1cec7d69039a640104209df71e1c7882ec32cfd8304048e10a0126
 SHA512:
-  metadata.gz: 354704a255a80def04f1993d3a73d5b358fedfeeda387278f47ab995ed10d2afced94fe175a15bc8dd7c55b1209798598b55a5f2d2071424f11613091cd87813
-  data.tar.gz: c1fae508d2b55269f0039524ee4e87714d7cc7244922faa83ac5b6c374abc29cdaab34355059ff92e5d6a0469be44900ac628fcfe55da80def3993549a2bcf27
+  metadata.gz: aefad15e842214027526baf6aeff07c584e064fbd40f8c31e9a71c875ea24acc179cd36c6f963839e4cfef0bd9ef775332db439f3e9b8a336b1235ee81c5a6fc
+  data.tar.gz: 07461d5bb58adc219acfad52232d9199a8944b797c5176e63eb90316060aaa05e215ba27d1c86c659fcf92f6a3204407b71274d69e3361ef9655ea0989fe6592
data/lib/base.rb CHANGED
@@ -9,11 +9,13 @@ module BlackStack
 
       def initialize(h)
         # array of numbers from 4000 to 4249
-        unless h[:proxy].nil?
-          self.ip = h[:proxy][:ip]
-          self.user = h[:proxy][:user]
-          self.password = h[:proxy][:password]
-          self.ports = (h[:proxy][:port_from]..h[:proxy][:port_to]).to_a
+        if h
+          self.ip = h[:ip]
+          self.user = h[:user]
+          self.password = h[:password]
+          self.ports = (h[:port_from]..h[:port_to]).to_a
+        else
+          self.ports = []
         end
         self.port_index = -1
       end # initialize
@@ -26,10 +28,16 @@ module BlackStack
 
     class MechanizeBot < BlackStack::Bots::Bot
       attr_accessor :agent # mechanize agent
+      def initialize(h)
+        super(h)
+      end
     end # MechanizeBot
 
     class SeleniumBot < BlackStack::Bots::Bot
       attr_accessor :driver # selenium driver
+      def initialize(h)
+        super(h)
+      end
     end # MechanizeBot
 
   end # Bots
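The base.rb change alters how a bot receives its proxy settings: 1.0.1 expected them nested under a :proxy key, while 1.0.2 reads the hash keys directly and treats a nil argument as "no proxy" with an empty port list. A minimal sketch of the new call shape, using hypothetical proxy values (not taken from the gem's documentation):

    # 1.0.1 expected the settings nested under :proxy:
    #   BlackStack::Bots::MechanizeBot.new(proxy: { ip: '10.0.0.1', user: 'u', password: 'p', port_from: 4000, port_to: 4249 })
    # 1.0.2 reads the keys directly:
    bot = BlackStack::Bots::MechanizeBot.new(
      ip: '10.0.0.1',      # hypothetical proxy host
      user: 'u',           # hypothetical credentials
      password: 'p',
      port_from: 4000,
      port_to: 4249        # ports 4000..4249 become the rotation pool
    )
    # Passing nil is now allowed and leaves the bot without a proxy (ports == []).
    plain_bot = BlackStack::Bots::MechanizeBot.new(nil)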
data/lib/bots.rb CHANGED
@@ -1,9 +1,14 @@
+require 'open-uri'
 require 'mechanize'
 require 'selenium-webdriver'
 require 'simple_cloud_logging'
 require 'colorize'
 require 'csv'
+require 'pry'
+require 'sitemap-parser'
+require 'timeout'
 
 require_relative './base'
 require_relative './google'
+require_relative './scraper'
 require_relative './indeed'
data/lib/scraper.rb ADDED
@@ -0,0 +1,143 @@
+module BlackStack
+  module Bots
+    class Scraper < BlackStack::Bots::MechanizeBot
+      attr_accessor :domain, :links
+      # auxiliar array of links that I have extracted links from
+      attr_accessor :links_processed
+
+      def initialize(init_domain, h)
+        super(h)
+        self.domain = init_domain
+        #self.agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+        self.links = []
+        self.links_processed = []
+      end # def initialize
+
+      def get(url)
+        # initialize mechanize agent
+        self.agent = Mechanize.new
+        # set a proxy with user and password
+        self.port_index += 1
+        self.port_index = 0 if self.port_index >= self.ports.length
+        self.agent.set_proxy(self.ip, self.ports[self.port_index], self.user, self.password) if self.proxy?
+        self.agent.open_timeout = 5
+        self.agent.read_timeout = 5
+        # return
+        return Timeout::timeout(5) { self.agent.get(url) }
+      end
+
+      def get_links_from_sitemap(l=nil)
+        i = 0
+        l.logs "Scrape sitemaps... "
+        begin
+          # download the robots.txt
+          url = "http://#{domain}/robots.txt"
+          # get the content of robots.txt from url
+          s = Timeout::timeout(5) { URI.open(url).read }
+          # get the sitemap
+          sitemaps = s.split("\n").select { |line| line =~ /^sitemap:/i }.map { |a| a.downcase.split('sitemap:').last.strip }.uniq
+          sitemaps.each { |b|
+            parser = Timeout::timeout(5) { SitemapParser.new b }
+            self.links += Timeout::timeout(5) { parser.to_a }
+            self.links.uniq!
+          }
+          l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
+        rescue => e
+          l.logf "Error: #{e.message.split("\n").first[0..100]})".red # get_links
+        end
+      end
+
+      # internal use only
+      def get_links_from_url(url, l=nil)
+        l = BlackStack::DummyLogger.new(nil) if l.nil?
+        l.logs "get_links (#{url})... "
+        begin
+          aux = []
+          # trim url
+          url = url.strip
+          # get domain of the url using open-uri
+          domain = URI.parse(url).host
+          # visit the main page of the website
+          page = self.get(url)
+          # get the self.links to the pages of the website
+          aux = page.links.map(&:href)
+          # remove # from the self.links
+          aux = aux.map { |link| !link.nil? && link.split('#').first }
+          # remove querystring from the self.links
+          aux = aux.map { |link| !link.nil? && link.split('?').first }
+          # remove the self.links that are not http:// or https://
+          aux = aux.select { |link| !link.nil? && link =~ /^https?:\/\// }
+          # remove the self.links that are not from the same domain
+          aux = aux.select { |link| !link.nil? && link =~ /#{domain}/ }
+          # remove nil values
+          aux = aux.compact
+          # remove duplications
+          aux = aux.uniq
+          # filter links who already are in the list
+          a = aux.size
+          aux = aux.select { |link| !self.links.include?(link) }
+          b = aux.size
+          # add new links to self.links
+          self.links += aux
+          l.logf "done".green + " (#{a} links found, #{b} new, #{self.links.size} total)" # get_links
+        rescue => e
+          l.logf "Error: #{e.message.split("\n").first[0..100]})".red # get_links
+        end
+      end # def get_links_from_url
+
+      def get_links(stop_at=10, l=nil)
+        l = BlackStack::DummyLogger.new(nil) if l.nil?
+        # working with root url
+        url = "http://#{self.domain}/"
+        self.links << url if self.links.select { |link| link == url }.empty?
+        # iterate until I have discovered all the links
+        while self.links.size != self.links_processed.size && stop_at >= self.links.size
+          # iterate the links who are not in links_processed
+          self.links.select { |link| !self.links_processed.include?(link) }.each { |link|
+            # get the links from the url
+            self.get_links_from_url(link, l)
+            # add the link to the list of processed links
+            self.links_processed << link
+          }
+        end # while
+        # get links from the sitemap
+        self.get_links_from_sitemap(l)
+      end # def get_links
+
+      def find_keywords(a, stop_at=50, l=nil)
+        ret = []
+        l = BlackStack::DummyLogger.new(nil) if l.nil?
+        # iterate the links
+        j = 0
+        self.links.reject { |link| link =~ /\.pdf$/i || link =~ /\.jpg$/i || link =~ /\.jpeg$/i || link =~ /\.gif$/i }.each { |link|
+          j += 1
+          break if j > stop_at
+          l.logs "#{j.to_s}. find_keywords (#{link})... "
+          begin
+            # get the page
+            page = self.get(link)
+            # get page body content in plain text
+            s = Timeout::timeout(5) { Nokogiri::HTML(page.body).text }
+            # iterate the keywords
+            i = 0
+            a.each { |k|
+              # find the keyword
+              if s =~ /#{Regexp.escape(k)}/i
+                i += 1
+                ret << link if ret.select { |link| link == link }.empty?
+                break
+              end # if
+            } # each
+            break if ret.size > 0
+            l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
+          rescue => e
+            l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
+          end # begin
+        } # each
+        # return
+        ret
+      end
+
+    end # class Scraper
+  end # module Bots
+end # module BlackStack
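scraper.rb is the main addition in 1.0.2: a Scraper built on MechanizeBot that crawls a domain's internal links, pulls extra URLs from the sitemaps listed in robots.txt, and scans page bodies for keywords. A hedged usage sketch, assuming the gem is loaded with require 'bots' and using the BlackStack::DummyLogger referenced in the code above (domain, proxy values, and keywords below are placeholders):

    require 'bots'

    # Proxy hash uses the 1.0.2 key layout shown in base.rb; pass nil to go direct.
    proxy = { ip: '10.0.0.1', user: 'u', password: 'p', port_from: 4000, port_to: 4249 }

    scraper = BlackStack::Bots::Scraper.new('example.com', proxy)
    log = BlackStack::DummyLogger.new(nil)

    # Crawl internal links starting from the root URL (stops expanding once
    # more than 10 links are collected), then append URLs from the sitemaps.
    scraper.get_links(10, log)

    # Return links whose page body matches at least one keyword;
    # find_keywords stops after the first matching page.
    hits = scraper.find_keywords(['ruby', 'scraping'], 50, log)
    puts hits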
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bots
 version: !ruby/object:Gem::Version
-  version: 1.0.1
+  version: 1.0.2
 platform: ruby
 authors:
 - Leandro Daniel Sardi
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-08-08 00:00:00.000000000 Z
+date: 2023-08-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: simple_cloud_logging
@@ -110,6 +110,86 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: 0.8.1
+- !ruby/object:Gem::Dependency
+  name: pry
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.14.2
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.14.2
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.14.2
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.14.2
+- !ruby/object:Gem::Dependency
+  name: open-uri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.2.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.2.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.2.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.2.0
+- !ruby/object:Gem::Dependency
+  name: sitemap-parser
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.5.6
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.5.6
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.5.6
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.5.6
+- !ruby/object:Gem::Dependency
+  name: timeout
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.4.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.4.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.4.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.4.0
 description: Ruby gem for scraping information from the public web.
 email: leandro@connectionsphere.com
 executables: []
@@ -120,6 +200,7 @@ files:
 - lib/bots.rb
 - lib/google.rb
 - lib/indeed.rb
+- lib/scraper.rb
 homepage: https://rubygems.org/gems/bots
 licenses:
 - MIT
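The metadata diff records four new runtime dependencies (pry, open-uri, sitemap-parser, timeout) matching the new requires in bots.rb. Expressed as gemspec declarations they would correspond to roughly the following sketch; the real bots.gemspec is not part of this diff, so this is reconstructed from the dependency entries above:

    # Illustrative only; version constraints copied from the metadata diff.
    Gem::Specification.new do |s|
      s.name    = 'bots'
      s.version = '1.0.2'
      s.add_runtime_dependency 'pry',            '~> 0.14.2', '>= 0.14.2'
      s.add_runtime_dependency 'open-uri',       '~> 0.2.0',  '>= 0.2.0'
      s.add_runtime_dependency 'sitemap-parser', '~> 0.5.6',  '>= 0.5.6'
      s.add_runtime_dependency 'timeout',        '~> 0.4.0',  '>= 0.4.0'
    end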