bots 1.0.2 → 1.0.4

Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/scraper.rb +18 -8
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 5d048bca926b971212391dd9405a1d82d1f3afa563afcbe0db65af2c754b42df
- data.tar.gz: 2eb8ecbc2e1cec7d69039a640104209df71e1c7882ec32cfd8304048e10a0126
+ metadata.gz: ef6ff9fa026dd9e1f1c750b5963c957a4deacf5413f92a3dc8fd1f0584305898
+ data.tar.gz: e652c088971c2fd9c97c12902d09ce7ed4933da36da6e010f2f3ffae1abdb292
  SHA512:
- metadata.gz: aefad15e842214027526baf6aeff07c584e064fbd40f8c31e9a71c875ea24acc179cd36c6f963839e4cfef0bd9ef775332db439f3e9b8a336b1235ee81c5a6fc
- data.tar.gz: 07461d5bb58adc219acfad52232d9199a8944b797c5176e63eb90316060aaa05e215ba27d1c86c659fcf92f6a3204407b71274d69e3361ef9655ea0989fe6592
+ metadata.gz: 1c8fce3accf4fc84701706355880bd59cc1cd5a16dfbc4a05bbdb30faa492c7f212f4a22ef917b81e7b8ba5f0c6caaa911f23bef1df4f4f23868ab225673adff
+ data.tar.gz: 3942240b8cff2bdbfdb003912af8da5c26f6b117a3744becbff6a781f77f9eb32d2c5e0aef1439edc6b288ec0ff359f5e45fec68e21db8d6cca78c8d36cf04dc
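
The entries above are the published SHA256 and SHA512 digests of the packaged metadata.gz and data.tar.gz. As a minimal verification sketch (assuming the 1.0.4 .gem archive has already been unpacked into the current directory; only the two file names listed above are taken from this diff), the values can be recomputed with Ruby's Digest standard library:

require 'digest'

# Recompute the digests of the two members of an unpacked *.gem archive
# and compare them by eye against the values published in checksums.yaml.
%w[metadata.gz data.tar.gz].each do |file|
  bytes = File.binread(file)
  puts "#{file}:"
  puts "  SHA256: #{Digest::SHA256.hexdigest(bytes)}"
  puts "  SHA512: #{Digest::SHA512.hexdigest(bytes)}"
end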
data/lib/scraper.rb CHANGED
@@ -61,6 +61,8 @@ module BlackStack
  page = self.get(url)
  # get the self.links to the pages of the website
  aux = page.links.map(&:href)
+ # remove non-string elements
+ aux = aux.select { |link| link.is_a?(String) }
  # remove # from the self.links
  aux = aux.map { |link| !link.nil? && link.split('#').first }
  # remove querystring from the self.links
@@ -104,8 +106,8 @@ module BlackStack
  self.get_links_from_sitemap(l)
  end # def get_links
 
- def find_keywords(a, stop_at=50, l=nil)
- ret = []
+ def find_keywords(a, stop_at=50, stop_on_first_link_found=false, l=nil)
+ pages = []
  l = BlackStack::DummyLogger.new(nil) if l.nil?
  # iterate the links
  j = 0
@@ -117,25 +119,33 @@ module BlackStack
  # get the page
  page = self.get(link)
  # get page body content in plain text
- s = Timeout::timeout(5) { Nokogiri::HTML(page.body).text }
+ title = page.title
+ s = Timeout::timeout(5) { page.search('body').text }
+ # add the link to the results of no-keyword
+ hpage = { :url => link, :title => title, :html => page.body, :keywords => [] }
+ pages << hpage
  # iterate the keywords
  i = 0
+ match = false
  a.each { |k|
  # find the keyword
- if s =~ /#{Regexp.escape(k)}/i
+ match = ( s =~ /#{Regexp.escape(k)}/i )
+ hpage[:keywords] << k if match
+ # count the number of links with match
+ # break if only 1 link is needed
+ if match
  i += 1
- ret << link if ret.select { |link| link == link }.empty?
- break
+ break if stop_on_first_link_found
  end # if
  } # each
- break if ret.size > 0
+ break if match && stop_on_first_link_found
  l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
  rescue => e
  l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
  end # begin
  } # each
  # return
- ret
+ pages
  end
 
  end # class Scraper
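
The net effect of this change: find_keywords no longer returns a flat array of matching links; it now returns one hash per visited page (:url, :title, :html, and the list of :keywords that matched on that page), and the new stop_on_first_link_found flag makes the crawl stop at the first page with any match. A minimal usage sketch, assuming a scraper instance whose link list has already been collected; the class path, constructor arguments, and keyword list below are illustrative assumptions, not taken from this diff:

# Hypothetical construction; the real class path and arguments may differ.
scraper = BlackStack::Bots::Scraper.new('https://example.com')

# New signature: find_keywords(a, stop_at=50, stop_on_first_link_found=false, l=nil)
pages = scraper.find_keywords(['pricing', 'enterprise'], 50, true)

# Each entry is a hash as built in the diff above.
pages.each do |page|
  if page[:keywords].empty?
    puts "#{page[:url]} - no keywords found"
  else
    puts "#{page[:url]} (#{page[:title]}) - matched: #{page[:keywords].join(', ')}"
  end
end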
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: bots
  version: !ruby/object:Gem::Version
- version: 1.0.2
+ version: 1.0.4
  platform: ruby
  authors:
  - Leandro Daniel Sardi
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2023-08-15 00:00:00.000000000 Z
+ date: 2023-08-18 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: simple_cloud_logging