bots 1.0.3 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/scraper.rb +16 -8
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3a8018e0d8575a415699c41dcba236e3c4f400e8132111093e421ac02e792548
4
- data.tar.gz: 4b876044081e94743d1b719c53424331d44bb200a8bdcfddd7d78562209eeed3
3
+ metadata.gz: d58802b035984822506024bb8745293a9c6edf666c9ed4b69311282f226b079a
4
+ data.tar.gz: 250faa62467ee442ed198b36247f4d5a271dfc03f9614e0021357873bdf749a4
5
5
  SHA512:
6
- metadata.gz: 326e82a582132f2d267e906df73aad0d812f9fc4fe00c8af2ba9ef6cd93a174ae2004719ec76f5cc6b018da0eda1b2cd891dd51f7df7adaad6044da773f207ec
7
- data.tar.gz: 2a4944c21854faee39f81b81004fb63baf3f270b2374f081b98ca25b4f7695af80184b1449dff38207d4c21b7f34b524d59c997931f2db2743df42c56a714875
6
+ metadata.gz: ddb39e422226e1b490e18d364ae7ab15b6bb8c20425091f41168b32c59ef59809c769e7cf2d8628998676799a21b2cb7957aeb807a9bd1995e1658217822be65
7
+ data.tar.gz: f8431a7d719c30326d3c1dd69e8b80dd4f916b004c7a44b464c22d83fa5acce26cd59f0b94709b1955ff1dda6450ccd8bc7121b33368b91f92380e88a1f01e54
data/lib/scraper.rb CHANGED
@@ -106,8 +106,8 @@ module BlackStack
106
106
  self.get_links_from_sitemap(l)
107
107
  end # def get_links
108
108
 
109
- def find_keywords(a, stop_at=50, l=nil)
110
- ret = []
109
+ def find_keywords(a, stop_at=50, stop_on_first_link_found=false, l=nil)
110
+ pages = []
111
111
  l = BlackStack::DummyLogger.new(nil) if l.nil?
112
112
  # iterate the links
113
113
  j = 0
@@ -119,25 +119,33 @@ module BlackStack
119
119
  # get the page
120
120
  page = self.get(link)
121
121
  # get page body content in plain text
122
- s = Timeout::timeout(5) { Nokogiri::HTML(page.body).text }
122
+ title = page.title
123
+ s = Timeout::timeout(5) { page.search('body').text }
124
+ # add the link to the results of no-keyword
125
+ hpage = { 'page_url' => link.downcase, 'page_title' => title, 'page_html' => page.body, 'keywords' => [] }
126
+ pages << hpage
123
127
  # iterate the keywords
124
128
  i = 0
129
+ match = false
125
130
  a.each { |k|
126
131
  # find the keyword
127
- if s =~ /#{Regexp.escape(k)}/i
132
+ match = ( s =~ /#{Regexp.escape(k)}/i )
133
+ hpage['keywords'] << k if match
134
+ # count the number of links with match
135
+ # break if only 1 link is needed
136
+ if match
128
137
  i += 1
129
- ret << link if ret.select { |link| link == link }.empty?
130
- break
138
+ break if stop_on_first_link_found
131
139
  end # if
132
140
  } # each
133
- break if ret.size > 0
141
+ break if match && stop_on_first_link_found
134
142
  l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
135
143
  rescue => e
136
144
  l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
137
145
  end # begin
138
146
  } # each
139
147
  # return
140
- ret
148
+ pages
141
149
  end
142
150
 
143
151
  end # class Scraper
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bots
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.3
4
+ version: 1.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Leandro Daniel Sardi
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-08-16 00:00:00.000000000 Z
11
+ date: 2023-08-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: simple_cloud_logging