bots 1.0.2 → 1.0.4
This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/lib/scraper.rb +18 -8
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ef6ff9fa026dd9e1f1c750b5963c957a4deacf5413f92a3dc8fd1f0584305898
+  data.tar.gz: e652c088971c2fd9c97c12902d09ce7ed4933da36da6e010f2f3ffae1abdb292
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1c8fce3accf4fc84701706355880bd59cc1cd5a16dfbc4a05bbdb30faa492c7f212f4a22ef917b81e7b8ba5f0c6caaa911f23bef1df4f4f23868ab225673adff
+  data.tar.gz: 3942240b8cff2bdbfdb003912af8da5c26f6b117a3744becbff6a781f77f9eb32d2c5e0aef1439edc6b288ec0ff359f5e45fec68e21db8d6cca78c8d36cf04dc
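The new checksums cover the two archives packed inside the .gem file (metadata.gz and data.tar.gz). As a minimal sketch, assuming the release has been fetched locally as bots-1.0.4.gem (for example with `gem fetch bots -v 1.0.4`), they could be compared against the published SHA256 values like this:

require 'digest'
require 'rubygems/package'

# Expected SHA256 values taken from the checksums.yaml diff above.
EXPECTED = {
  'metadata.gz' => 'ef6ff9fa026dd9e1f1c750b5963c957a4deacf5413f92a3dc8fd1f0584305898',
  'data.tar.gz' => 'e652c088971c2fd9c97c12902d09ce7ed4933da36da6e010f2f3ffae1abdb292'
}

# A .gem file is a plain tar archive; hash each member we care about.
File.open('bots-1.0.4.gem', 'rb') do |io|
  Gem::Package::TarReader.new(io) do |tar|
    tar.each do |entry|
      next unless EXPECTED.key?(entry.full_name)
      actual = Digest::SHA256.hexdigest(entry.read)
      puts "#{entry.full_name}: #{actual == EXPECTED[entry.full_name] ? 'OK' : 'MISMATCH'}"
    end
  end
end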
data/lib/scraper.rb
CHANGED
@@ -61,6 +61,8 @@ module BlackStack
       page = self.get(url)
       # get the self.links to the pages of the website
       aux = page.links.map(&:href)
+      # remove non-string elements
+      aux = aux.select { |link| link.is_a?(String) }
       # remove # from the self.links
       aux = aux.map { |link| !link.nil? && link.split('#').first }
       # remove querystring from the self.links
@@ -104,8 +106,8 @@ module BlackStack
       self.get_links_from_sitemap(l)
     end # def get_links
 
-    def find_keywords(a, stop_at=50, l=nil)
-
+    def find_keywords(a, stop_at=50, stop_on_first_link_found=false, l=nil)
+      pages = []
       l = BlackStack::DummyLogger.new(nil) if l.nil?
       # iterate the links
       j = 0
@@ -117,25 +119,33 @@ module BlackStack
           # get the page
           page = self.get(link)
           # get page body content in plain text
-
+          title = page.title
+          s = Timeout::timeout(5) { page.search('body').text }
+          # add the link to the results of no-keyword
+          hpage = { :url => link, :title => title, :html => page.body, :keywords => [] }
+          pages << hpage
           # iterate the keywords
           i = 0
+          match = false
           a.each { |k|
             # find the keyword
-
+            match = ( s =~ /#{Regexp.escape(k)}/i )
+            hpage[:keywords] << k if match
+            # count the number of links with match
+            # break if only 1 link is needed
+            if match
               i += 1
-
-              break
+              break if stop_on_first_link_found
            end # if
           } # each
-          break if
+          break if match && stop_on_first_link_found
           l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
         rescue => e
           l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
         end # begin
       } # each
       # return
-
+      pages
     end
 
   end # class Scraper
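In short, `find_keywords` now collects every visited page into a `pages` array of hashes (`:url`, `:title`, `:html`, `:keywords`) and returns it, and the new `stop_on_first_link_found` flag (default `false`) lets callers abort the scan as soon as any keyword matches. A minimal usage sketch follows; the scraper construction and keyword list below are assumptions for illustration, not taken from the diff:

require 'bots'

# Assumption: how a Scraper is constructed is not shown in this diff;
# the no-argument constructor below is illustrative only.
scraper = BlackStack::Scraper.new

# Assumption: the scraper's links have already been collected
# (e.g. via get_links) before scanning them for keywords.
# Stop as soon as one page matches any keyword (stop_at=50, flag=true).
pages = scraper.find_keywords(['ruby', 'scraper'], 50, true)

pages.each do |page|
  status = page[:keywords].empty? ? 'no match' : page[:keywords].join(', ')
  puts "#{page[:url]} (#{page[:title]}): #{status}"
end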
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bots
 version: !ruby/object:Gem::Version
-  version: 1.0.2
+  version: 1.0.4
 platform: ruby
 authors:
 - Leandro Daniel Sardi
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-08-
+date: 2023-08-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: simple_cloud_logging
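The metadata change is the routine release bump: version 1.0.4, built 2023-08-18, with the dependency list (starting with simple_cloud_logging) unchanged in this hunk. A hypothetical Gemfile entry pinning this release, assuming the gem is installed from rubygems.org, would be:

# Gemfile (illustrative only): pin the bots gem to the release covered by this diff.
source 'https://rubygems.org'

gem 'bots', '1.0.4'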
|