bots 1.0.3 → 1.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/scraper.rb +16 -8
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ef6ff9fa026dd9e1f1c750b5963c957a4deacf5413f92a3dc8fd1f0584305898
|
4
|
+
data.tar.gz: e652c088971c2fd9c97c12902d09ce7ed4933da36da6e010f2f3ffae1abdb292
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1c8fce3accf4fc84701706355880bd59cc1cd5a16dfbc4a05bbdb30faa492c7f212f4a22ef917b81e7b8ba5f0c6caaa911f23bef1df4f4f23868ab225673adff
|
7
|
+
data.tar.gz: 3942240b8cff2bdbfdb003912af8da5c26f6b117a3744becbff6a781f77f9eb32d2c5e0aef1439edc6b288ec0ff359f5e45fec68e21db8d6cca78c8d36cf04dc
|
data/lib/scraper.rb
CHANGED
@@ -106,8 +106,8 @@ module BlackStack
|
|
106
106
|
self.get_links_from_sitemap(l)
|
107
107
|
end # def get_links
|
108
108
|
|
109
|
-
def find_keywords(a, stop_at=50, l=nil)
|
110
|
-
|
109
|
+
def find_keywords(a, stop_at=50, stop_on_first_link_found=false, l=nil)
|
110
|
+
pages = []
|
111
111
|
l = BlackStack::DummyLogger.new(nil) if l.nil?
|
112
112
|
# iterate the links
|
113
113
|
j = 0
|
@@ -119,25 +119,33 @@ module BlackStack
|
|
119
119
|
# get the page
|
120
120
|
page = self.get(link)
|
121
121
|
# get page body content in plain text
|
122
|
-
|
122
|
+
title = page.title
|
123
|
+
s = Timeout::timeout(5) { page.search('body').text }
|
124
|
+
# add the link to the results of no-keyword
|
125
|
+
hpage = { :url => link, :title => title, :html => page.body, :keywords => [] }
|
126
|
+
pages << hpage
|
123
127
|
# iterate the keywords
|
124
128
|
i = 0
|
129
|
+
match = false
|
125
130
|
a.each { |k|
|
126
131
|
# find the keyword
|
127
|
-
|
132
|
+
match = ( s =~ /#{Regexp.escape(k)}/i )
|
133
|
+
hpage[:keywords] << k if match
|
134
|
+
# count the number of links with match
|
135
|
+
# break if only 1 link is needed
|
136
|
+
if match
|
128
137
|
i += 1
|
129
|
-
|
130
|
-
break
|
138
|
+
break if stop_on_first_link_found
|
131
139
|
end # if
|
132
140
|
} # each
|
133
|
-
break if
|
141
|
+
break if match && stop_on_first_link_found
|
134
142
|
l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
|
135
143
|
rescue => e
|
136
144
|
l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
|
137
145
|
end # begin
|
138
146
|
} # each
|
139
147
|
# return
|
140
|
-
|
148
|
+
pages
|
141
149
|
end
|
142
150
|
|
143
151
|
end # class Scraper
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bots
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.3
|
4
|
+
version: 1.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Leandro Daniel Sardi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-08-
|
11
|
+
date: 2023-08-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: simple_cloud_logging
|