url_common 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 4f3290f0cba2dcd19ebc741320e0153f2ba5065f927914f850f956d675fe8752
- data.tar.gz: f93a6899d9b39729db16140698c35885c4b000710cc9b5e1dc635dc9a21467aa
+ metadata.gz: e75ba5e9d78301e4cf753f4ade0a54b036aaca17d7527791b984d8bc81215a5a
+ data.tar.gz: 7bae7598dd4884c9c795e511c32c02cca9095a32f23522e0241842631250488b
  SHA512:
- metadata.gz: a534233ebf72a903303eb4b273459ec717d81fb4acff7daafbfe57f368be7c44b06d9fd1773561506c0b1957b1eac61e5012ff9d582743245b100009e8feaa72
- data.tar.gz: 5b2efc559ad70767383a9fd35a458ef524713bb12c1c1c0c3393527e98a7ccde3ab32480793a2a91cd23c7bbbefffaaa3e16f12ca7656c75f7920621ceb9fc37
+ metadata.gz: d633158a6b206eca935948a72ebcce4bf90a196c373eca02ae9895819f5b027333b60e4dc5ec84f7b6507d4f468ac1752d36ca367abae4b03b65fd6665857095
+ data.tar.gz: a48dc2eace4656f5b2e16957174ae399633d97ddcff830d799ce31dee94ebfcbe648a4e3a1eb78c0ea8a91f63d8df22df783eed7f99d21920a2ea6a3b6edef44
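
For anyone spot-checking this release, note that these digests cover the two members inside the .gem archive (metadata.gz and data.tar.gz), not the .gem file itself. A minimal Ruby sketch of verifying the new data.tar.gz digest, assuming a locally downloaded url_common-0.1.1.gem:

    require "digest"
    require "rubygems/package"

    expected = "7bae7598dd4884c9c795e511c32c02cca9095a32f23522e0241842631250488b"

    # A .gem is a tar archive; hash its data.tar.gz member and compare.
    File.open("url_common-0.1.1.gem", "rb") do |io|
      tar = Gem::Package::TarReader.new(io)
      tar.each do |entry|
        next unless entry.full_name == "data.tar.gz"
        digest = Digest::SHA256.hexdigest(entry.read)
        puts(digest == expected ? "data.tar.gz: checksum ok" : "mismatch: #{digest}")
      end
    end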
data/Gemfile CHANGED
@@ -3,6 +3,8 @@ source "https://rubygems.org"
  # Specify your gem's dependencies in url_common.gemspec
  gemspec

+ ruby "2.7.1"
+
  gem "rake", "~> 12.0"
  gem "rspec", "~> 3.0"
  gem "fuzzyurl", '~> 0.9.0'
data/Gemfile.lock ADDED
@@ -0,0 +1,72 @@
+ PATH
+   remote: .
+   specs:
+     url_common (0.1.1)
+       fuzzyurl (~> 0.9.0)
+       mechanize (~> 2.6)
+
+ GEM
+   remote: https://rubygems.org/
+   specs:
+     byebug (11.1.3)
+     connection_pool (2.2.3)
+     diff-lcs (1.4.4)
+     domain_name (0.5.20190701)
+       unf (>= 0.0.5, < 1.0.0)
+     fuzzyurl (0.9.0)
+     http-cookie (1.0.3)
+       domain_name (~> 0.5)
+     mechanize (2.7.6)
+       domain_name (~> 0.5, >= 0.5.1)
+       http-cookie (~> 1.0)
+       mime-types (>= 1.17.2)
+       net-http-digest_auth (~> 1.1, >= 1.1.1)
+       net-http-persistent (>= 2.5.2)
+       nokogiri (~> 1.6)
+       ntlm-http (~> 0.1, >= 0.1.1)
+       webrobots (>= 0.0.9, < 0.2)
+     mime-types (3.3.1)
+       mime-types-data (~> 3.2015)
+     mime-types-data (3.2020.0512)
+     mini_portile2 (2.4.0)
+     net-http-digest_auth (1.4.1)
+     net-http-persistent (4.0.0)
+       connection_pool (~> 2.2)
+     nokogiri (1.10.10)
+       mini_portile2 (~> 2.4.0)
+     ntlm-http (0.1.1)
+     rake (12.3.3)
+     rspec (3.9.0)
+       rspec-core (~> 3.9.0)
+       rspec-expectations (~> 3.9.0)
+       rspec-mocks (~> 3.9.0)
+     rspec-core (3.9.2)
+       rspec-support (~> 3.9.3)
+     rspec-expectations (3.9.2)
+       diff-lcs (>= 1.2.0, < 2.0)
+       rspec-support (~> 3.9.0)
+     rspec-mocks (3.9.1)
+       diff-lcs (>= 1.2.0, < 2.0)
+       rspec-support (~> 3.9.0)
+     rspec-support (3.9.3)
+     unf (0.1.4)
+       unf_ext
+     unf_ext (0.0.7.7)
+     webrobots (0.1.2)
+
+ PLATFORMS
+   ruby
+
+ DEPENDENCIES
+   byebug
+   fuzzyurl (~> 0.9.0)
+   mechanize (~> 2.6)
+   rake (~> 12.0)
+   rspec (~> 3.0)
+   url_common!
+
+ RUBY VERSION
+    ruby 2.7.1p83
+
+ BUNDLED WITH
+    2.1.4
data/README.md CHANGED
@@ -1,8 +1,8 @@
  # UrlCommon

- Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/url_common`. To experiment with that code, run `bin/console` for an interactive prompt.
+ This is a gem for common URL-centric tasks. I wrote this code years ago and have moved it from project to project ever since, leaving a huge number of divergent versions scattered across my development system. I'm finally packaging it as a gem to make it easier to use across projects.

- TODO: Delete this and the text above, and describe your gem
+ I don't claim these are great or perfect; they are workmanlike tools which I FIND USEFUL, and open-sourcing them makes it easier to use them across multiple projects.

  ## Installation

@@ -24,6 +24,8 @@ Or install it yourself as:

  TODO: Write usage instructions here

+ This is a todo.
+
  ## Development

  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
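
The usage section of the README is still a TODO in this release, so here is a minimal sketch of how the helpers added in 0.1.1 might be called. Method names come from the data/lib/url_common.rb diff below; the illustrated return values are inferences from that code rather than documented behavior:

    require "url_common"

    # Split the scheme off a URL string (returns "https" here).
    UrlCommon.get_protocol("https://example.com/post")

    # Extract Mechanize link objects from an HTML fragment.
    links = UrlCommon.extract_links_from_text('<a href="https://example.com">home</a>')

    # Probe common feed paths, then fall back to parsing <link rel="alternate">
    # tags and finally the Feedbag gem.
    feed_url = UrlCommon.discover_feed_url("https://ma.tt")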
data/lib/url_common/version.rb CHANGED
@@ -1,3 +1,3 @@
  module UrlCommon
-   VERSION = "0.1.0"
+   VERSION = "0.1.1"
  end
data/lib/url_common.rb CHANGED
@@ -266,4 +266,235 @@ module UrlCommon

    return mechanize_page
  end
+
+   # TODO: needs tests
+   def self.get_meta_description(url, html)
+     page = UrlCommon.create_mechanize_page_from_html(url, html)
+     description = ""
+     begin
+       description = page.parser.at("meta[name='description']")['content']
+     rescue StandardError => e
+       # no meta description tag; fall through and return ""
+     end
+     return description
+   end
+
+   # TODO: needs tests
+   def self.get_page_title(url, html)
+     page = UrlCommon.create_mechanize_page_from_html(url, html)
+     title = ""
+     begin
+       title = page.parser.css('title').first.content
+     rescue StandardError => e
+       # no <title> tag; fall through and return ""
+     end
+     return title
+   end
+
+   def self.extract_links_from_text(text)
+     agent = Mechanize.new
+     html = "<HTML><BODY>#{text}</BODY></HTML>"
+     page = Mechanize::Page.new(nil, { 'content-type' => 'text/html' }, html, nil, agent)
+     return page.links
+   end
+
+   # https://docs.aylien.com/textapi/#using-the-api
+   # TODO: incomplete -- fetches the page but does not yet call the
+   # summarization endpoint (GET /summarize?url=...).
+   def self.summarize_url(url)
+     agent = Mechanize.new
+     summarization_url = ""
+     page = agent.get(url)
+   end
+
+   # Checks whether a site returns proper 404s. Test case for a site that
+   # never returns one: https://devslopes.com/
+   def self.test_random_url(url_or_host)
+     random_filename = TextCommon.sha(Time.now.to_s) + ".xml"
+     if url_or_host =~ /http/
+       url = File.join(url_or_host, random_filename)
+     else
+       url = File.join("http://", url_or_host, random_filename)
+     end
+     status, url = UrlCommon.check_for_404(url, true)
+     #
+     # Key bit of logic -- if a randomized SHA-based filename resolves, the site
+     # has been configured to NEVER return a 404 (typically everything redirects
+     # to the home page). A 200 therefore signals a broken setup, so the logic
+     # is flipped and a 200 is reported as :error.
+     #
+     return :error, url if status == :ok
+     return :ok, url
+   end
+
+   def self.select_best_rssurl_from_rssurls(urls)
+     return urls.sort_by(&:length).first
+   end
+
+   def self.possible_rssurls(site_url, skip_slash_blog = false)
+     # urls we will probe
+     possible_rssurl_formats = []
+
+     # normal baselines
+     possible_rssurl_formats << "feed.xml"
+     possible_rssurl_formats << "rss.xml"
+     possible_rssurl_formats << "atom.xml"
+     possible_rssurl_formats << "feed/"
+
+     # optionally look at /blog/
+     unless skip_slash_blog
+       possible_rssurl_formats << "/blog/feed.xml"
+       possible_rssurl_formats << "/blog/rss.xml"
+       possible_rssurl_formats << "/blog/atom.xml"
+       possible_rssurl_formats << "/blog/feed/"
+     end
+
+     possible_rssurls = []
+     possible_rssurl_formats.each do |url_format|
+       possible_rssurls << UrlCommon.join(site_url, url_format)
+     end
+
+     return possible_rssurls
+   end
+
+   def self.parse_html_for_rssurl_from_head(site_url, page = nil, debug = false)
+     if page
+       status = :ok
+     else
+       status, page = UrlCommon.get_page(site_url)
+     end
+     puts "Into html parse for rssurl" if debug
+     possibles = []
+     if status == :ok && page
+       results = page.css("link[rel='alternate'][type='application/rss+xml']")
+       #
+       # If only a single one then return it
+       #
+       return results.first['href'] if results.size == 1
+
+       #
+       # If an array then filter out the comment feeds
+       #
+       results.each do |result|
+         possibles << result unless result['title'] =~ /comments? feed/i
+       end
+
+       #
+       # Loop over the possibles and just return the shortest url
+       #
+       # TODO -- can likely do a better job on this
+       #
+       urls = []
+       possibles.each do |possible|
+         urls << possible['href']
+       end
+       return UrlCommon.select_best_rssurl_from_rssurls(urls)
+     end
+   end
+
+   def self.get_protocol(url)
+     parts = url.to_s.split(":")
+     return parts.first
+   end
+
+   # https://500hats.com/feed
+   # UrlCommon.discover_feed_url("https://nickjanetakis.com")
+   def self.discover_feed_url(site_url, debug = false)
+     # step 1: remove the file from the site_url if it has one
+     # step 2: probe the common feed urls and 404 check them
+
+     #
+     # Build a set of possibles
+     #
+     possible_rssurls = UrlCommon.possible_rssurls(site_url)
+
+     #
+     # Keep track of failures
+     #
+     failed_probes = Set.new
+
+     # step 3: parse the html, e.g.:
+     # <link rel="alternate" type="application/rss+xml" href="http://scripting.com/rss.xml" />
+     # <link rel="alternate" type="application/rss+xml" title="Matt Mullenweg &raquo; Feed" href="https://ma.tt/feed/" />
+     # <link rel="alternate" type="application/rss+xml" title="Matt Mullenweg &raquo; Comments Feed" href="https://ma.tt/comments/feed/" />
+
+     #
+     # Stage 1 -- do http head probing
+     #
+     possible_rssurls.each do |rssurl|
+       puts "Head Probing for: #{rssurl}" if debug
+
+       # abort if we doubled blog, i.e. /blog/blog/ in the url
+       next if rssurl =~ /blog\/blog/
+       next if failed_probes.include?(rssurl)
+
+       status, url = UrlCommon.check_for_404(rssurl, true)
+       random_status, random_url = UrlCommon.test_random_url(site_url)
+       return rssurl if status == :ok && random_status == :ok
+       failed_probes << rssurl
+     end
+
+     puts "After probe, failed_probes as: #{failed_probes.inspect}" if debug
+
+     #
+     # Stage 2 -- if subdirectory, go up one level and probe again
+     #
+     # TODO
+
+     #
+     # Stage 3 -- go to the site root and probe again
+     #
+     # test for this is the nick site
+     fuzzy_url_parts = Fuzzyurl.new(site_url)
+     base_url = "#{fuzzy_url_parts.protocol}://#{fuzzy_url_parts.hostname}"
+     possible_rssurls = UrlCommon.possible_rssurls(base_url)
+     possible_rssurls.each do |rssurl|
+       puts "Head Probing for: #{rssurl} at site root stage" if debug
+
+       # abort if we doubled blog, i.e. /blog/blog/ in the url
+       next if rssurl =~ /blog\/blog/
+       next if failed_probes.include?(rssurl)
+
+       status, url = UrlCommon.check_for_404(rssurl, true)
+       return rssurl if status == :ok
+       failed_probes << rssurl
+     end
+
+     #
+     # Stage 4 -- parse the html <head> for a feed link
+     #
+     rssurl = UrlCommon.parse_html_for_rssurl_from_head(site_url, nil, true)
+     return rssurl if rssurl
+
+     #
+     # Stage 5 -- fall back to the Feedbag gem
+     #
+     results = Feedbag.find(site_url)
+
+     #
+     # Stage 6 -- TODO: cache failures to redis so we don't look for them again
+     #
+
+     return UrlCommon.select_best_rssurl_from_rssurls(results)
+   end
+
  end
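
None of the helpers added above ship with tests yet (the "TODO: needs tests" markers say as much). As a starting point, here is an illustrative RSpec sketch against three of the new methods; the title and candidate-count expectations follow directly from the code above, while the exact URL strings produced by possible_rssurls depend on how UrlCommon.join (defined in the unchanged part of this file) normalizes slashes:

    # spec/url_common_feed_spec.rb -- illustrative only, not part of this release
    require "url_common"

    RSpec.describe UrlCommon do
      it "extracts a page title from raw html" do
        html = "<html><head><title>Hello</title></head><body></body></html>"
        expect(UrlCommon.get_page_title("https://example.com/", html)).to eq("Hello")
      end

      it "builds eight candidate feed urls per site" do
        urls = UrlCommon.possible_rssurls("https://example.com")
        expect(urls.size).to eq(8)
      end

      it "prefers the shortest candidate feed url" do
        urls = ["https://example.com/blog/feed.xml", "https://example.com/feed.xml"]
        expect(UrlCommon.select_best_rssurl_from_rssurls(urls)).to eq("https://example.com/feed.xml")
      end
    end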
data/url_common.gemspec CHANGED
@@ -26,4 +26,8 @@ Gem::Specification.new do |spec|
    spec.bindir = "exe"
    spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
    spec.require_paths = ["lib"]
+
+   spec.add_dependency 'fuzzyurl', '~> 0.9.0'
+   spec.add_dependency 'mechanize', '~> 2.6'
+
  end
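
The practical effect of these two add_dependency lines is that bundler now installs fuzzyurl and mechanize automatically; as the metadata diff below shows, 0.1.0 declared no dependencies at all. A minimal consumer Gemfile sketch:

    source "https://rubygems.org"

    # fuzzyurl and mechanize arrive transitively via url_common's gemspec
    gem "url_common", "~> 0.1.1"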
metadata CHANGED
@@ -1,15 +1,43 @@
  --- !ruby/object:Gem::Specification
  name: url_common
  version: !ruby/object:Gem::Version
-   version: 0.1.0
+   version: 0.1.1
  platform: ruby
  authors:
  - Scott Johnson
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2020-08-12 00:00:00.000000000 Z
- dependencies: []
+ date: 2022-06-04 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: fuzzyurl
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 0.9.0
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 0.9.0
+ - !ruby/object:Gem::Dependency
+   name: mechanize
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.6'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.6'
  description: This is a class library for common url manipulation and crawling tasks. It
    is based on a career focused on the practical side of working with the Internet
    using Ruby.
@@ -24,6 +52,7 @@ files:
  - ".travis.yml"
  - CODE_OF_CONDUCT.md
  - Gemfile
+ - Gemfile.lock
  - LICENSE.txt
  - README.md
  - Rakefile