url_common 0.1.0 → 0.1.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 4f3290f0cba2dcd19ebc741320e0153f2ba5065f927914f850f956d675fe8752
-   data.tar.gz: f93a6899d9b39729db16140698c35885c4b000710cc9b5e1dc635dc9a21467aa
+   metadata.gz: e75ba5e9d78301e4cf753f4ade0a54b036aaca17d7527791b984d8bc81215a5a
+   data.tar.gz: 7bae7598dd4884c9c795e511c32c02cca9095a32f23522e0241842631250488b
  SHA512:
-   metadata.gz: a534233ebf72a903303eb4b273459ec717d81fb4acff7daafbfe57f368be7c44b06d9fd1773561506c0b1957b1eac61e5012ff9d582743245b100009e8feaa72
-   data.tar.gz: 5b2efc559ad70767383a9fd35a458ef524713bb12c1c1c0c3393527e98a7ccde3ab32480793a2a91cd23c7bbbefffaaa3e16f12ca7656c75f7920621ceb9fc37
+   metadata.gz: d633158a6b206eca935948a72ebcce4bf90a196c373eca02ae9895819f5b027333b60e4dc5ec84f7b6507d4f468ac1752d36ca367abae4b03b65fd6665857095
+   data.tar.gz: a48dc2eace4656f5b2e16957174ae399633d97ddcff830d799ce31dee94ebfcbe648a4e3a1eb78c0ea8a91f63d8df22df783eed7f99d21920a2ea6a3b6edef44
data/Gemfile CHANGED
@@ -3,6 +3,8 @@ source "https://rubygems.org"
  # Specify your gem's dependencies in url_common.gemspec
  gemspec
 
+ ruby "2.7.1"
+
  gem "rake", "~> 12.0"
  gem "rspec", "~> 3.0"
  gem "fuzzyurl", '~> 0.9.0'
data/Gemfile.lock ADDED
@@ -0,0 +1,72 @@
+ PATH
+   remote: .
+   specs:
+     url_common (0.1.1)
+       fuzzyurl (~> 0.9.0)
+       mechanize (~> 2.6)
+
+ GEM
+   remote: https://rubygems.org/
+   specs:
+     byebug (11.1.3)
+     connection_pool (2.2.3)
+     diff-lcs (1.4.4)
+     domain_name (0.5.20190701)
+       unf (>= 0.0.5, < 1.0.0)
+     fuzzyurl (0.9.0)
+     http-cookie (1.0.3)
+       domain_name (~> 0.5)
+     mechanize (2.7.6)
+       domain_name (~> 0.5, >= 0.5.1)
+       http-cookie (~> 1.0)
+       mime-types (>= 1.17.2)
+       net-http-digest_auth (~> 1.1, >= 1.1.1)
+       net-http-persistent (>= 2.5.2)
+       nokogiri (~> 1.6)
+       ntlm-http (~> 0.1, >= 0.1.1)
+       webrobots (>= 0.0.9, < 0.2)
+     mime-types (3.3.1)
+       mime-types-data (~> 3.2015)
+     mime-types-data (3.2020.0512)
+     mini_portile2 (2.4.0)
+     net-http-digest_auth (1.4.1)
+     net-http-persistent (4.0.0)
+       connection_pool (~> 2.2)
+     nokogiri (1.10.10)
+       mini_portile2 (~> 2.4.0)
+     ntlm-http (0.1.1)
+     rake (12.3.3)
+     rspec (3.9.0)
+       rspec-core (~> 3.9.0)
+       rspec-expectations (~> 3.9.0)
+       rspec-mocks (~> 3.9.0)
+     rspec-core (3.9.2)
+       rspec-support (~> 3.9.3)
+     rspec-expectations (3.9.2)
+       diff-lcs (>= 1.2.0, < 2.0)
+       rspec-support (~> 3.9.0)
+     rspec-mocks (3.9.1)
+       diff-lcs (>= 1.2.0, < 2.0)
+       rspec-support (~> 3.9.0)
+     rspec-support (3.9.3)
+     unf (0.1.4)
+       unf_ext
+     unf_ext (0.0.7.7)
+     webrobots (0.1.2)
+
+ PLATFORMS
+   ruby
+
+ DEPENDENCIES
+   byebug
+   fuzzyurl (~> 0.9.0)
+   mechanize (~> 2.6)
+   rake (~> 12.0)
+   rspec (~> 3.0)
+   url_common!
+
+ RUBY VERSION
+   ruby 2.7.1p83
+
+ BUNDLED WITH
+   2.1.4
data/README.md CHANGED
@@ -1,8 +1,8 @@
  # UrlCommon
 
- Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/url_common`. To experiment with that code, run `bin/console` for an interactive prompt.
+ This is a gem for performing common URL-centric tasks. I wrote this code years ago and have always just moved it from project to project, leading to a huge number of different versions on my development system. Finally I'm packaging it as a gem to make it easier to use across projects.
 
- TODO: Delete this and the text above, and describe your gem
+ I don't claim that these are great or perfect. They are workmanlike tools which I FIND USEFUL, and I want to use them more easily across multiple projects, hence the open sourcing of them.
 
  ## Installation
 
@@ -24,6 +24,8 @@ Or install it yourself as:
 
  TODO: Write usage instructions here
 
+ This is a todo.
+
  ## Development
 
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
data/lib/url_common/version.rb CHANGED
@@ -1,3 +1,3 @@
  module UrlCommon
-   VERSION = "0.1.0"
+   VERSION = "0.1.1"
  end
data/lib/url_common.rb CHANGED
@@ -266,4 +266,235 @@ module UrlCommon
 
    return mechanize_page
  end
+
+ # TODO: needs tests
+ def self.get_meta_description(url, html)
+   page = UrlCommon.create_mechanize_page_from_html(url, html)
+   description = ""
+   begin
+     description = page.parser.at("meta[name='description']")['content']
+   rescue StandardError => e
+   end
+   return description
+ end
+
+ # TODO: needs tests
+ def self.get_page_title(url, html)
+   page = UrlCommon.create_mechanize_page_from_html(url, html)
+   title = ""
+   begin
+     title = page.parser.css('title').first.content
+   rescue StandardError => e
+   end
+   return title
+ end
+
+ def self.extract_links_from_text(text)
+   agent = Mechanize.new
+   html = "<HTML><BODY>#{text}</BODY></HTML>"
+   page = Mechanize::Page.new(nil, {'content-type' => 'text/html'}, html, nil, agent)
+   return page.links
+ end
+
+ # https://docs.aylien.com/textapi/#using-the-api
+ def self.summarize_url(url)
+   # GET /summarize?url=http://www.bbc.com/sport/0/football/25912393
+   agent = Mechanize.new
+   summarization_url = ""
+   page = agent.get(url)
+ end
+
+ # Test case for a site that never returns a 404: https://devslopes.com/
+ def self.test_random_url(url_or_host)
+   random_filename = TextCommon.sha(Time.now.to_s) + ".xml"
+   if url_or_host =~ /http/
+     url = File.join(url_or_host, random_filename)
+   else
+     url = File.join("http://", url_or_host, random_filename)
+   end
+   status, url = UrlCommon.check_for_404(url, true)
+   #
+   # Key bit of logic -- if we get a 200 back for a randomized sha filename it
+   # means the destination site never returns a proper 404 (it serves the home
+   # page for anything), so flip the logic and return :error on a 200.
+   #
+   return :error, url if status == :ok
+   return :ok, url
+ end
+
+ def self.select_best_rssurl_from_rssurls(urls)
+   return urls.sort_by(&:length).first
+ end
+
+ def self.possible_rssurls(site_url, skip_slash_blog = false)
+   # urls we will probe
+   possible_rssurl_formats = []
+
+   # normal baselines
+   possible_rssurl_formats << "feed.xml"
+   possible_rssurl_formats << "rss.xml"
+   possible_rssurl_formats << "atom.xml"
+   possible_rssurl_formats << "feed/"
+
+   # optionally look at /blog/
+   possible_rssurl_formats << "/blog/feed.xml"
+   possible_rssurl_formats << "/blog/rss.xml"
+   possible_rssurl_formats << "/blog/atom.xml"
+   possible_rssurl_formats << "/blog/feed/"
+
+   possible_rssurls = []
+   possible_rssurl_formats.each do |url_format|
+     possible_rssurls << UrlCommon.join(site_url, url_format)
+   end
+
+   return possible_rssurls
+ end
+
+ def self.parse_html_for_rssurl_from_head(site_url, page = nil, debug = false)
+   if page
+     status = :ok
+   else
+     status, page = UrlCommon.get_page(site_url)
+   end
+   puts "Into html parse for rssurl" if debug
+   possibles = []
+   if status == :ok && page
+     #results = page.css("link[rel='alternate']")
+     results = page.css("link[rel='alternate'][type='application/rss+xml']")
+     #
+     # If only a single one then return it
+     #
+     #return results.first['href'] if results.first['type'] =~ /application\/rss\+xml/i && results.size == 1
+     return results.first['href'] if results.size == 1
+
+     #
+     # If an array then filter out the comments
+     #
+     results.each do |result|
+       possibles << result unless result['title'] =~ /comments? feed/i
+     end
+
+     #
+     # Loop over the possibles and just return the shortest url
+     #
+     # TODO -- can likely do a better job on this
+     #
+     urls = []
+     possibles.each do |possible|
+       urls << possible['href']
+     end
+     return UrlCommon.select_best_rssurl_from_rssurls(urls)
+     #return urls.sort_by(&:length).first
+
+     # doc = Nokogiri::HTML(page.body)
+     # results << doc.at('link[rel="alternate"]')
+     # results = results.flatten
+   end
+ end
+
+ def self.get_protocol(url)
+   parts = url.to_s.split(":")
+   return parts.first
+ end
+
+ # https://500hats.com/feed
+ # UrlCommon.discover_feed_url("https://nickjanetakis.com")
+ def self.discover_feed_url(site_url, debug = false)
+   # step 1: remove the file from the site_url if it has one
+   # step 2: probe the common ones and 404 check
+
+   #
+   # Build a set of possibles
+   #
+   possible_rssurls = UrlCommon.possible_rssurls(site_url)
+
+   #
+   # Keep track of failures
+   #
+   failed_probes = Set.new
+
+   # step 3: parse the html
+   #<link rel="alternate" type="application/rss+xml" href="http://scripting.com/rss.xml" />
+   #<link rel="alternate" type="application/rss+xml" title="Matt Mullenweg &raquo; Feed" href="https://ma.tt/feed/" />
+   #<link rel="alternate" type="application/rss+xml" title="Matt Mullenweg &raquo; Comments Feed" href="https://ma.tt/comments/feed/" />
+
+   #
+   # Stage 1 -- do http head probing
+   #
+   possible_rssurls.each do |rssurl|
+     puts "Head Probing for: #{rssurl}" if debug
+
+     # abort if we doubled blog i.e. /blog/blog/ in the url
+     next if rssurl =~ /blog\/blog/
+     next if failed_probes.include?(rssurl)
+
+     status, url = UrlCommon.check_for_404(rssurl, true)
+     random_status, random_url = UrlCommon.test_random_url(site_url)
+     #debugger
+     return rssurl if status == :ok && random_status == :ok
+     failed_probes << rssurl
+   end
+
+   puts "After probe, failed_probes as: #{failed_probes.inspect}"
+
+   #
+   # Stage 2 -- if subdirectory, go up one level and probe again
+   #
+   # TODO
+
+   #
+   # Stage 3 -- go to the site root and probe again
+   #
+   # test for this is the nick site
+   fuzzy_url_parts = Fuzzyurl.new(site_url)
+   base_url = "#{fuzzy_url_parts.protocol}://#{fuzzy_url_parts.hostname}"
+   possible_rssurls = UrlCommon.possible_rssurls(base_url)
+   #debugger
+   possible_rssurls.each do |rssurl|
+     puts "Head Probing for: #{rssurl} at site root stage" #if debug
+
+     # abort if we doubled blog i.e. /blog/blog/ in the url
+     next if rssurl =~ /blog\/blog/
+     next if failed_probes.include?(rssurl)
+
+     status, url = UrlCommon.check_for_404(rssurl, true)
+     return rssurl if status == :ok
+     failed_probes << rssurl
+   end
+
+   #
+   # Stage 4 -- parse the html
+   #
+   rssurl = UrlCommon.parse_html_for_rssurl_from_head(site_url, nil, true)
+   return rssurl if rssurl
+
+   #
+   # Stage 5 -- fall back to Feedbag
+   #
+   results = Feedbag.find(site_url)
+   # checked_results = []
+   # results.each do |result|
+   #   struct = UrlCommon.check_for_404(result)
+   #   checked_results << result if struct.status == 200
+   # end
+
+   #
+   # Stage 6 -- cache failures to redis so we don't look for them again
+   #
+   #$redis.
+
+   return UrlCommon.select_best_rssurl_from_rssurls(results)
+ end
+
  end
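
For orientation, the new helpers added to data/lib/url_common.rb above can be exercised roughly as follows. This is a sketch, not code shipped in the gem: the example URL is arbitrary, and TextCommon, Set, and Feedbag (used internally by test_random_url and discover_feed_url) are assumed to be available at runtime.

    require 'mechanize'
    require 'url_common'

    url  = "https://ma.tt/"
    html = Mechanize.new.get(url).body

    # Both helpers re-parse the supplied HTML and return "" if the tag is missing.
    puts UrlCommon.get_page_title(url, html)
    puts UrlCommon.get_meta_description(url, html)

    # Wraps a text fragment in <HTML><BODY> and returns Mechanize link objects.
    links = UrlCommon.extract_links_from_text("<a href='https://ma.tt/feed/'>feed</a>")
    links.each { |link| puts link.href }

    # Probes common feed paths, then the site root, then <link rel="alternate">
    # tags, and finally falls back to Feedbag.
    puts UrlCommon.discover_feed_url(url, true)
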
data/url_common.gemspec CHANGED
@@ -26,4 +26,8 @@ Gem::Specification.new do |spec|
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]
+
+ spec.add_dependency 'fuzzyurl', '~> 0.9.0'
+ spec.add_dependency 'mechanize', '~> 2.6'
+
  end
metadata CHANGED
@@ -1,15 +1,43 @@
  --- !ruby/object:Gem::Specification
  name: url_common
  version: !ruby/object:Gem::Version
-   version: 0.1.0
+   version: 0.1.1
  platform: ruby
  authors:
  - Scott Johnson
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2020-08-12 00:00:00.000000000 Z
- dependencies: []
+ date: 2022-06-04 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: fuzzyurl
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 0.9.0
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 0.9.0
+ - !ruby/object:Gem::Dependency
+   name: mechanize
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.6'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.6'
  description: This is a class library for common url manipulation and crawling tasks. It
    is based on a career focused on the practical side of working with the Internet
    using Ruby.
@@ -24,6 +52,7 @@ files:
  - ".travis.yml"
  - CODE_OF_CONDUCT.md
  - Gemfile
+ - Gemfile.lock
  - LICENSE.txt
  - README.md
  - Rakefile