bots 1.0.3 → 1.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/scraper.rb +16 -8
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d58802b035984822506024bb8745293a9c6edf666c9ed4b69311282f226b079a
|
4
|
+
data.tar.gz: 250faa62467ee442ed198b36247f4d5a271dfc03f9614e0021357873bdf749a4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ddb39e422226e1b490e18d364ae7ab15b6bb8c20425091f41168b32c59ef59809c769e7cf2d8628998676799a21b2cb7957aeb807a9bd1995e1658217822be65
|
7
|
+
data.tar.gz: f8431a7d719c30326d3c1dd69e8b80dd4f916b004c7a44b464c22d83fa5acce26cd59f0b94709b1955ff1dda6450ccd8bc7121b33368b91f92380e88a1f01e54
|
data/lib/scraper.rb
CHANGED
@@ -106,8 +106,8 @@ module BlackStack
|
|
106
106
|
self.get_links_from_sitemap(l)
|
107
107
|
end # def get_links
|
108
108
|
|
109
|
-
def find_keywords(a, stop_at=50, l=nil)
|
110
|
-
|
109
|
+
def find_keywords(a, stop_at=50, stop_on_first_link_found=false, l=nil)
|
110
|
+
pages = []
|
111
111
|
l = BlackStack::DummyLogger.new(nil) if l.nil?
|
112
112
|
# iterate the links
|
113
113
|
j = 0
|
@@ -119,25 +119,33 @@ module BlackStack
|
|
119
119
|
# get the page
|
120
120
|
page = self.get(link)
|
121
121
|
# get page body content in plain text
|
122
|
-
|
122
|
+
title = page.title
|
123
|
+
s = Timeout::timeout(5) { page.search('body').text }
|
124
|
+
# add the link to the results of no-keyword
|
125
|
+
hpage = { 'page_url' => link.downcase, 'page_title' => title, 'page_html' => page.body, 'keywords' => [] }
|
126
|
+
pages << hpage
|
123
127
|
# iterate the keywords
|
124
128
|
i = 0
|
129
|
+
match = false
|
125
130
|
a.each { |k|
|
126
131
|
# find the keyword
|
127
|
-
|
132
|
+
match = ( s =~ /#{Regexp.escape(k)}/i )
|
133
|
+
hpage['keywords'] << k if match
|
134
|
+
# count the number of links with match
|
135
|
+
# break if only 1 link is needed
|
136
|
+
if match
|
128
137
|
i += 1
|
129
|
-
|
130
|
-
break
|
138
|
+
break if stop_on_first_link_found
|
131
139
|
end # if
|
132
140
|
} # each
|
133
|
-
break if
|
141
|
+
break if match && stop_on_first_link_found
|
134
142
|
l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
|
135
143
|
rescue => e
|
136
144
|
l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
|
137
145
|
end # begin
|
138
146
|
} # each
|
139
147
|
# return
|
140
|
-
|
148
|
+
pages
|
141
149
|
end
|
142
150
|
|
143
151
|
end # class Scraper
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bots
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.3
|
4
|
+
version: 1.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Leandro Daniel Sardi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-08-
|
11
|
+
date: 2023-08-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: simple_cloud_logging
|