bots 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/scraper.rb +18 -8
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5d048bca926b971212391dd9405a1d82d1f3afa563afcbe0db65af2c754b42df
4
- data.tar.gz: 2eb8ecbc2e1cec7d69039a640104209df71e1c7882ec32cfd8304048e10a0126
3
+ metadata.gz: ef6ff9fa026dd9e1f1c750b5963c957a4deacf5413f92a3dc8fd1f0584305898
4
+ data.tar.gz: e652c088971c2fd9c97c12902d09ce7ed4933da36da6e010f2f3ffae1abdb292
5
5
  SHA512:
6
- metadata.gz: aefad15e842214027526baf6aeff07c584e064fbd40f8c31e9a71c875ea24acc179cd36c6f963839e4cfef0bd9ef775332db439f3e9b8a336b1235ee81c5a6fc
7
- data.tar.gz: 07461d5bb58adc219acfad52232d9199a8944b797c5176e63eb90316060aaa05e215ba27d1c86c659fcf92f6a3204407b71274d69e3361ef9655ea0989fe6592
6
+ metadata.gz: 1c8fce3accf4fc84701706355880bd59cc1cd5a16dfbc4a05bbdb30faa492c7f212f4a22ef917b81e7b8ba5f0c6caaa911f23bef1df4f4f23868ab225673adff
7
+ data.tar.gz: 3942240b8cff2bdbfdb003912af8da5c26f6b117a3744becbff6a781f77f9eb32d2c5e0aef1439edc6b288ec0ff359f5e45fec68e21db8d6cca78c8d36cf04dc
data/lib/scraper.rb CHANGED
@@ -61,6 +61,8 @@ module BlackStack
61
61
  page = self.get(url)
62
62
  # get the self.links to the pages of the website
63
63
  aux = page.links.map(&:href)
64
+ # remove non-string elements
65
+ aux = aux.select { |link| link.is_a?(String) }
64
66
  # remove # from the self.links
65
67
  aux = aux.map { |link| !link.nil? && link.split('#').first }
66
68
  # remove querystring from the self.links
@@ -104,8 +106,8 @@ module BlackStack
104
106
  self.get_links_from_sitemap(l)
105
107
  end # def get_links
106
108
 
107
- def find_keywords(a, stop_at=50, l=nil)
108
- ret = []
109
+ def find_keywords(a, stop_at=50, stop_on_first_link_found=false, l=nil)
110
+ pages = []
109
111
  l = BlackStack::DummyLogger.new(nil) if l.nil?
110
112
  # iterate the links
111
113
  j = 0
@@ -117,25 +119,33 @@ module BlackStack
117
119
  # get the page
118
120
  page = self.get(link)
119
121
  # get page body content in plain text
120
- s = Timeout::timeout(5) { Nokogiri::HTML(page.body).text }
122
+ title = page.title
123
+ s = Timeout::timeout(5) { page.search('body').text }
124
+ # add the link to the results of no-keyword
125
+ hpage = { :url => link, :title => title, :html => page.body, :keywords => [] }
126
+ pages << hpage
121
127
  # iterate the keywords
122
128
  i = 0
129
+ match = false
123
130
  a.each { |k|
124
131
  # find the keyword
125
- if s =~ /#{Regexp.escape(k)}/i
132
+ match = ( s =~ /#{Regexp.escape(k)}/i )
133
+ hpage[:keywords] << k if match
134
+ # count the number of links with match
135
+ # break if only 1 link is needed
136
+ if match
126
137
  i += 1
127
- ret << link if ret.select { |link| link == link }.empty?
128
- break
138
+ break if stop_on_first_link_found
129
139
  end # if
130
140
  } # each
131
- break if ret.size > 0
141
+ break if match && stop_on_first_link_found
132
142
  l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
133
143
  rescue => e
134
144
  l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
135
145
  end # begin
136
146
  } # each
137
147
  # return
138
- ret
148
+ pages
139
149
  end
140
150
 
141
151
  end # class Scraper
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bots
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Leandro Daniel Sardi
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-08-15 00:00:00.000000000 Z
11
+ date: 2023-08-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: simple_cloud_logging