bots 1.0.2 → 1.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/scraper.rb +18 -8
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ef6ff9fa026dd9e1f1c750b5963c957a4deacf5413f92a3dc8fd1f0584305898
|
4
|
+
data.tar.gz: e652c088971c2fd9c97c12902d09ce7ed4933da36da6e010f2f3ffae1abdb292
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1c8fce3accf4fc84701706355880bd59cc1cd5a16dfbc4a05bbdb30faa492c7f212f4a22ef917b81e7b8ba5f0c6caaa911f23bef1df4f4f23868ab225673adff
|
7
|
+
data.tar.gz: 3942240b8cff2bdbfdb003912af8da5c26f6b117a3744becbff6a781f77f9eb32d2c5e0aef1439edc6b288ec0ff359f5e45fec68e21db8d6cca78c8d36cf04dc
|
data/lib/scraper.rb
CHANGED
@@ -61,6 +61,8 @@ module BlackStack
|
|
61
61
|
page = self.get(url)
|
62
62
|
# get the self.links to the pages of the website
|
63
63
|
aux = page.links.map(&:href)
|
64
|
+
# remove non-string elements
|
65
|
+
aux = aux.select { |link| link.is_a?(String) }
|
64
66
|
# remove # from the self.links
|
65
67
|
aux = aux.map { |link| !link.nil? && link.split('#').first }
|
66
68
|
# remove querystring from the self.links
|
@@ -104,8 +106,8 @@ module BlackStack
|
|
104
106
|
self.get_links_from_sitemap(l)
|
105
107
|
end # def get_links
|
106
108
|
|
107
|
-
def find_keywords(a, stop_at=50, l=nil)
|
108
|
-
|
109
|
+
def find_keywords(a, stop_at=50, stop_on_first_link_found=false, l=nil)
|
110
|
+
pages = []
|
109
111
|
l = BlackStack::DummyLogger.new(nil) if l.nil?
|
110
112
|
# iterate the links
|
111
113
|
j = 0
|
@@ -117,25 +119,33 @@ module BlackStack
|
|
117
119
|
# get the page
|
118
120
|
page = self.get(link)
|
119
121
|
# get page body content in plain text
|
120
|
-
|
122
|
+
title = page.title
|
123
|
+
s = Timeout::timeout(5) { page.search('body').text }
|
124
|
+
# add the link to the results of no-keyword
|
125
|
+
hpage = { :url => link, :title => title, :html => page.body, :keywords => [] }
|
126
|
+
pages << hpage
|
121
127
|
# iterate the keywords
|
122
128
|
i = 0
|
129
|
+
match = false
|
123
130
|
a.each { |k|
|
124
131
|
# find the keyword
|
125
|
-
|
132
|
+
match = ( s =~ /#{Regexp.escape(k)}/i )
|
133
|
+
hpage[:keywords] << k if match
|
134
|
+
# count the number of links with match
|
135
|
+
# break if only 1 link is needed
|
136
|
+
if match
|
126
137
|
i += 1
|
127
|
-
|
128
|
-
break
|
138
|
+
break if stop_on_first_link_found
|
129
139
|
end # if
|
130
140
|
} # each
|
131
|
-
break if
|
141
|
+
break if match && stop_on_first_link_found
|
132
142
|
l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
|
133
143
|
rescue => e
|
134
144
|
l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
|
135
145
|
end # begin
|
136
146
|
} # each
|
137
147
|
# return
|
138
|
-
|
148
|
+
pages
|
139
149
|
end
|
140
150
|
|
141
151
|
end # class Scraper
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bots
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.2
|
4
|
+
version: 1.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Leandro Daniel Sardi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-08-
|
11
|
+
date: 2023-08-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: simple_cloud_logging
|