bots 1.0.3 → 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/scraper.rb +16 -8
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d58802b035984822506024bb8745293a9c6edf666c9ed4b69311282f226b079a
|
4
|
+
data.tar.gz: 250faa62467ee442ed198b36247f4d5a271dfc03f9614e0021357873bdf749a4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ddb39e422226e1b490e18d364ae7ab15b6bb8c20425091f41168b32c59ef59809c769e7cf2d8628998676799a21b2cb7957aeb807a9bd1995e1658217822be65
|
7
|
+
data.tar.gz: f8431a7d719c30326d3c1dd69e8b80dd4f916b004c7a44b464c22d83fa5acce26cd59f0b94709b1955ff1dda6450ccd8bc7121b33368b91f92380e88a1f01e54
|
data/lib/scraper.rb
CHANGED
@@ -106,8 +106,8 @@ module BlackStack
|
|
106
106
|
self.get_links_from_sitemap(l)
|
107
107
|
end # def get_links
|
108
108
|
|
109
|
-
def find_keywords(a, stop_at=50, l=nil)
|
110
|
-
|
109
|
+
def find_keywords(a, stop_at=50, stop_on_first_link_found=false, l=nil)
|
110
|
+
pages = []
|
111
111
|
l = BlackStack::DummyLogger.new(nil) if l.nil?
|
112
112
|
# iterate the links
|
113
113
|
j = 0
|
@@ -119,25 +119,33 @@ module BlackStack
|
|
119
119
|
# get the page
|
120
120
|
page = self.get(link)
|
121
121
|
# get page body content in plain text
|
122
|
-
|
122
|
+
title = page.title
|
123
|
+
s = Timeout::timeout(5) { page.search('body').text }
|
124
|
+
# add the link to the results of no-keyword
|
125
|
+
hpage = { 'page_url' => link.downcase, 'page_title' => title, 'page_html' => page.body, 'keywords' => [] }
|
126
|
+
pages << hpage
|
123
127
|
# iterate the keywords
|
124
128
|
i = 0
|
129
|
+
match = false
|
125
130
|
a.each { |k|
|
126
131
|
# find the keyword
|
127
|
-
|
132
|
+
match = ( s =~ /#{Regexp.escape(k)}/i )
|
133
|
+
hpage['keywords'] << k if match
|
134
|
+
# count the number of links with match
|
135
|
+
# break if only 1 link is needed
|
136
|
+
if match
|
128
137
|
i += 1
|
129
|
-
|
130
|
-
break
|
138
|
+
break if stop_on_first_link_found
|
131
139
|
end # if
|
132
140
|
} # each
|
133
|
-
break if
|
141
|
+
break if match && stop_on_first_link_found
|
134
142
|
l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
|
135
143
|
rescue => e
|
136
144
|
l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
|
137
145
|
end # begin
|
138
146
|
} # each
|
139
147
|
# return
|
140
|
-
|
148
|
+
pages
|
141
149
|
end
|
142
150
|
|
143
151
|
end # class Scraper
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bots
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Leandro Daniel Sardi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-08-
|
11
|
+
date: 2023-08-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: simple_cloud_logging
|