bots 1.0.3 → 1.0.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/scraper.rb +16 -8
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3a8018e0d8575a415699c41dcba236e3c4f400e8132111093e421ac02e792548
4
- data.tar.gz: 4b876044081e94743d1b719c53424331d44bb200a8bdcfddd7d78562209eeed3
3
+ metadata.gz: ef6ff9fa026dd9e1f1c750b5963c957a4deacf5413f92a3dc8fd1f0584305898
4
+ data.tar.gz: e652c088971c2fd9c97c12902d09ce7ed4933da36da6e010f2f3ffae1abdb292
5
5
  SHA512:
6
- metadata.gz: 326e82a582132f2d267e906df73aad0d812f9fc4fe00c8af2ba9ef6cd93a174ae2004719ec76f5cc6b018da0eda1b2cd891dd51f7df7adaad6044da773f207ec
7
- data.tar.gz: 2a4944c21854faee39f81b81004fb63baf3f270b2374f081b98ca25b4f7695af80184b1449dff38207d4c21b7f34b524d59c997931f2db2743df42c56a714875
6
+ metadata.gz: 1c8fce3accf4fc84701706355880bd59cc1cd5a16dfbc4a05bbdb30faa492c7f212f4a22ef917b81e7b8ba5f0c6caaa911f23bef1df4f4f23868ab225673adff
7
+ data.tar.gz: 3942240b8cff2bdbfdb003912af8da5c26f6b117a3744becbff6a781f77f9eb32d2c5e0aef1439edc6b288ec0ff359f5e45fec68e21db8d6cca78c8d36cf04dc
data/lib/scraper.rb CHANGED
@@ -106,8 +106,8 @@ module BlackStack
106
106
  self.get_links_from_sitemap(l)
107
107
  end # def get_links
108
108
 
109
- def find_keywords(a, stop_at=50, l=nil)
110
- ret = []
109
+ def find_keywords(a, stop_at=50, stop_on_first_link_found=false, l=nil)
110
+ pages = []
111
111
  l = BlackStack::DummyLogger.new(nil) if l.nil?
112
112
  # iterate the links
113
113
  j = 0
@@ -119,25 +119,33 @@ module BlackStack
119
119
  # get the page
120
120
  page = self.get(link)
121
121
  # get page body content in plain text
122
- s = Timeout::timeout(5) { Nokogiri::HTML(page.body).text }
122
+ title = page.title
123
+ s = Timeout::timeout(5) { page.search('body').text }
124
+ # add the link to the results of no-keyword
125
+ hpage = { :url => link, :title => title, :html => page.body, :keywords => [] }
126
+ pages << hpage
123
127
  # iterate the keywords
124
128
  i = 0
129
+ match = false
125
130
  a.each { |k|
126
131
  # find the keyword
127
- if s =~ /#{Regexp.escape(k)}/i
132
+ match = ( s =~ /#{Regexp.escape(k)}/i )
133
+ hpage[:keywords] << k if match
134
+ # count the number of links with match
135
+ # break if only 1 link is needed
136
+ if match
128
137
  i += 1
129
- ret << link if ret.select { |link| link == link }.empty?
130
- break
138
+ break if stop_on_first_link_found
131
139
  end # if
132
140
  } # each
133
- break if ret.size > 0
141
+ break if match && stop_on_first_link_found
134
142
  l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
135
143
  rescue => e
136
144
  l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
137
145
  end # begin
138
146
  } # each
139
147
  # return
140
- ret
148
+ pages
141
149
  end
142
150
 
143
151
  end # class Scraper
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bots
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.3
4
+ version: 1.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Leandro Daniel Sardi
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-08-16 00:00:00.000000000 Z
11
+ date: 2023-08-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: simple_cloud_logging