spider 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fa130209d8ff28f2cfb7c42ec1915ca39b5a0895e568882134a59b67757c432d
4
- data.tar.gz: 424d6d26fd66353515cbc7407fb0c5554a3566f19da0df526940c97025f4ef2c
3
+ metadata.gz: 7286f875f41881c9c8c385987c322b9832e5b07c6147aa4910a900e59015927e
4
+ data.tar.gz: 145ecd718a34521c17f0f9e939cb66f964ca0a1f93533969caf7d54426b213a8
5
5
  SHA512:
6
- metadata.gz: 686956d9960f6445e6f85dcf471b941154d232c7f9a0ff7c13008aed94f6a3a3a328e41abcbcbfa24bbedb5fe04bc481ec9ac31cbabf82806e5b7cb447075f79
7
- data.tar.gz: 94ea96cb7b1cb71777d4b74a42d868d853b4adbb08185be2035d6658e1e027374ce3d4cc75109dfea49ba8fb95b44d7ae5948f8d4b03a1027c780eac4023e29f
6
+ metadata.gz: ba771c7dbbe3df286475a5586ba2d2b63affe762b7bf504694e6865e39c8e7f5047811ac97285a1edc5d99cd478993fd2be9ee3ae244cc1690975f7ab3f0e779
7
+ data.tar.gz: 2bd17f25db36a267b5534ff663b8394e7439a1701970e6d23ea295818732133b23c4bb35f82aa1f6f90022edcaceafc47b3fee3f7902fd9028a2ec8ad697a2a4
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.6.0
1
+ 0.7.0
@@ -260,11 +260,34 @@ class SpiderInstance
260
260
  end
261
261
 
262
262
  def generate_next_urls(a_url, resp) #:nodoc:
263
+ # Only scan for links if the content-type is HTML or the URL ends with .html
264
+ content_type = resp['Content-Type'] || resp['content-type'] || ''
265
+ url_ends_with_html = a_url.downcase.end_with?('.html')
266
+
267
+ unless content_type.downcase.include?('text/html') || url_ends_with_html
268
+ return []
269
+ end
270
+
263
271
  web_page = resp.body
264
272
  base_url = (web_page.scan(/base\s+href="(.*?)"/i).flatten +
265
273
  [a_url[0,a_url.rindex('/')]])[0]
266
274
  base_url = remove_trailing_slash(base_url)
267
- web_page.scan(/href="(.*?)"/i).flatten.map do |link|
275
+
276
+ # Extract anchor tags with href attributes, respecting rel="nofollow"
277
+ web_page.scan(/<a\s[^>]*href="([^"]*)"[^>]*>/i).flatten.map do |link|
278
+ # Get the full anchor tag to check for rel attribute
279
+ anchor_match = web_page.match(/<a\s[^>]*href="#{Regexp.escape(link)}"[^>]*>/i)
280
+ next nil unless anchor_match
281
+
282
+ anchor_tag = anchor_match[0]
283
+
284
+ # Check if this link has rel="nofollow" or similar attributes that should be respected
285
+ if anchor_tag.match(/rel\s*=\s*["']([^"']*nofollow[^"']*)["']/i) ||
286
+ anchor_tag.match(/rel\s*=\s*["']([^"']*sponsored[^"']*)["']/i) ||
287
+ anchor_tag.match(/rel\s*=\s*["']([^"']*ugc[^"']*)["']/i)
288
+ next nil # Skip links with nofollow, sponsored, or ugc rel attributes
289
+ end
290
+
268
291
  begin
269
292
  parsed_link = URI.parse(link)
270
293
  if parsed_link.fragment == '#'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Nagro