bots 1.0.3 → 1.0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/scraper.rb +16 -8
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3a8018e0d8575a415699c41dcba236e3c4f400e8132111093e421ac02e792548
4
- data.tar.gz: 4b876044081e94743d1b719c53424331d44bb200a8bdcfddd7d78562209eeed3
3
+ metadata.gz: d58802b035984822506024bb8745293a9c6edf666c9ed4b69311282f226b079a
4
+ data.tar.gz: 250faa62467ee442ed198b36247f4d5a271dfc03f9614e0021357873bdf749a4
5
5
  SHA512:
6
- metadata.gz: 326e82a582132f2d267e906df73aad0d812f9fc4fe00c8af2ba9ef6cd93a174ae2004719ec76f5cc6b018da0eda1b2cd891dd51f7df7adaad6044da773f207ec
7
- data.tar.gz: 2a4944c21854faee39f81b81004fb63baf3f270b2374f081b98ca25b4f7695af80184b1449dff38207d4c21b7f34b524d59c997931f2db2743df42c56a714875
6
+ metadata.gz: ddb39e422226e1b490e18d364ae7ab15b6bb8c20425091f41168b32c59ef59809c769e7cf2d8628998676799a21b2cb7957aeb807a9bd1995e1658217822be65
7
+ data.tar.gz: f8431a7d719c30326d3c1dd69e8b80dd4f916b004c7a44b464c22d83fa5acce26cd59f0b94709b1955ff1dda6450ccd8bc7121b33368b91f92380e88a1f01e54
data/lib/scraper.rb CHANGED
@@ -106,8 +106,8 @@ module BlackStack
106
106
  self.get_links_from_sitemap(l)
107
107
  end # def get_links
108
108
 
109
- def find_keywords(a, stop_at=50, l=nil)
110
- ret = []
109
+ def find_keywords(a, stop_at=50, stop_on_first_link_found=false, l=nil)
110
+ pages = []
111
111
  l = BlackStack::DummyLogger.new(nil) if l.nil?
112
112
  # iterate the links
113
113
  j = 0
@@ -119,25 +119,33 @@ module BlackStack
119
119
  # get the page
120
120
  page = self.get(link)
121
121
  # get page body content in plain text
122
- s = Timeout::timeout(5) { Nokogiri::HTML(page.body).text }
122
+ title = page.title
123
+ s = Timeout::timeout(5) { page.search('body').text }
124
+ # add the link to the results of no-keyword
125
+ hpage = { 'page_url' => link.downcase, 'page_title' => title, 'page_html' => page.body, 'keywords' => [] }
126
+ pages << hpage
123
127
  # iterate the keywords
124
128
  i = 0
129
+ match = false
125
130
  a.each { |k|
126
131
  # find the keyword
127
- if s =~ /#{Regexp.escape(k)}/i
132
+ match = ( s =~ /#{Regexp.escape(k)}/i )
133
+ hpage['keywords'] << k if match
134
+ # count the number of links with match
135
+ # break if only 1 link is needed
136
+ if match
128
137
  i += 1
129
- ret << link if ret.select { |link| link == link }.empty?
130
- break
138
+ break if stop_on_first_link_found
131
139
  end # if
132
140
  } # each
133
- break if ret.size > 0
141
+ break if match && stop_on_first_link_found
134
142
  l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
135
143
  rescue => e
136
144
  l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
137
145
  end # begin
138
146
  } # each
139
147
  # return
140
- ret
148
+ pages
141
149
  end
142
150
 
143
151
  end # class Scraper
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bots
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.3
4
+ version: 1.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Leandro Daniel Sardi
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-08-16 00:00:00.000000000 Z
11
+ date: 2023-08-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: simple_cloud_logging