url_common 0.1.0 → 0.1.3

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 4f3290f0cba2dcd19ebc741320e0153f2ba5065f927914f850f956d675fe8752
-  data.tar.gz: f93a6899d9b39729db16140698c35885c4b000710cc9b5e1dc635dc9a21467aa
+  metadata.gz: 0ab8c804a828390106ee6e442cd46362ef5648ff9d41d36317f2644cb9f21e15
+  data.tar.gz: 7d6068a766ed82a3c4fe724fe426549c9c56047acbf373eb46d87aba763dfab0
 SHA512:
-  metadata.gz: a534233ebf72a903303eb4b273459ec717d81fb4acff7daafbfe57f368be7c44b06d9fd1773561506c0b1957b1eac61e5012ff9d582743245b100009e8feaa72
-  data.tar.gz: 5b2efc559ad70767383a9fd35a458ef524713bb12c1c1c0c3393527e98a7ccde3ab32480793a2a91cd23c7bbbefffaaa3e16f12ca7656c75f7920621ceb9fc37
+  metadata.gz: bddc296b7dae2781ec1ae7f8b8c79a1327e93bc1bfe30c761366033ef0882049948ea8c2ebbd7ce425aa7d437086b7ec38a4b30ebf0e976a4213990ef1e2d2b6
+  data.tar.gz: 395a2d6fec4519067d4dbb08f78c3e67cd4b1d2b1c7c8083299398790c5b9f9e6db81aade8c093f6b2eb39975f30b6923a80634d928e9c6fad159e12bb6a840d
data/Gemfile CHANGED
@@ -3,8 +3,15 @@ source "https://rubygems.org"
 # Specify your gem's dependencies in url_common.gemspec
 gemspec
 
+ruby "3.1.2"
+
 gem "rake", "~> 12.0"
 gem "rspec", "~> 3.0"
 gem "fuzzyurl", '~> 0.9.0'
-gem 'mechanize', '~> 2.6'
 gem "byebug"
+
+gem "hpricot", "~> 0.8.6"
+gem 'net-http-persistent', github: 'drbrain/net-http-persistent'
+gem "mechanize", "~> 2.7"
+
+gem "webrick", "~> 1.7"
data/Gemfile.lock ADDED
@@ -0,0 +1,82 @@
+GIT
+  remote: https://github.com/drbrain/net-http-persistent.git
+  revision: 857c3baaa541644fa437328b535042a500414119
+  specs:
+    net-http-persistent (4.0.1)
+      connection_pool (~> 2.2)
+
+PATH
+  remote: .
+  specs:
+    url_common (0.1.3)
+      fuzzyurl (~> 0.9.0)
+      mechanize (~> 2.6)
+
+GEM
+  remote: https://rubygems.org/
+  specs:
+    byebug (11.1.3)
+    connection_pool (2.2.5)
+    diff-lcs (1.4.4)
+    domain_name (0.5.20190701)
+      unf (>= 0.0.5, < 1.0.0)
+    fuzzyurl (0.9.0)
+    hpricot (0.8.6)
+    http-cookie (1.0.3)
+      domain_name (~> 0.5)
+    mechanize (2.7.6)
+      domain_name (~> 0.5, >= 0.5.1)
+      http-cookie (~> 1.0)
+      mime-types (>= 1.17.2)
+      net-http-digest_auth (~> 1.1, >= 1.1.1)
+      net-http-persistent (>= 2.5.2)
+      nokogiri (~> 1.6)
+      ntlm-http (~> 0.1, >= 0.1.1)
+      webrobots (>= 0.0.9, < 0.2)
+    mime-types (3.3.1)
+      mime-types-data (~> 3.2015)
+    mime-types-data (3.2020.0512)
+    mini_portile2 (2.4.0)
+    net-http-digest_auth (1.4.1)
+    nokogiri (1.10.10)
+      mini_portile2 (~> 2.4.0)
+    ntlm-http (0.1.1)
+    rake (12.3.3)
+    rspec (3.9.0)
+      rspec-core (~> 3.9.0)
+      rspec-expectations (~> 3.9.0)
+      rspec-mocks (~> 3.9.0)
+    rspec-core (3.9.2)
+      rspec-support (~> 3.9.3)
+    rspec-expectations (3.9.2)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.9.0)
+    rspec-mocks (3.9.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.9.0)
+    rspec-support (3.9.3)
+    unf (0.1.4)
+      unf_ext
+    unf_ext (0.0.7.7)
+    webrick (1.7.0)
+    webrobots (0.1.2)
+
+PLATFORMS
+  ruby
+
+DEPENDENCIES
+  byebug
+  fuzzyurl (~> 0.9.0)
+  hpricot (~> 0.8.6)
+  mechanize (~> 2.7)
+  net-http-persistent!
+  rake (~> 12.0)
+  rspec (~> 3.0)
+  url_common!
+  webrick (~> 1.7)
+
+RUBY VERSION
+   ruby 3.1.2p20
+
+BUNDLED WITH
+   2.1.4
data/README.md CHANGED
@@ -1,8 +1,8 @@
 # UrlCommon
 
-Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/url_common`. To experiment with that code, run `bin/console` for an interactive prompt.
+This is a gem for performing common Url centric things. I wrote this years ago and have always just moved it from project to project leading to a huge number of different versions on my development system. Finally I'm creating a gem out of it to aid in its use across projects.
 
-TODO: Delete this and the text above, and describe your gem
+I don't claim that these are great, perfect, etc. I claim that they are workman like tools which I FIND USEFUL and I want to use them more easily across multiple projects hence the open sourcing of them.
 
 ## Installation
 
@@ -24,6 +24,8 @@ Or install it yourself as:
 
 TODO: Write usage instructions here
 
+This is a todo.
+
 ## Development
 
 After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
data/lib/url_common/version.rb CHANGED
@@ -1,3 +1,3 @@
 module UrlCommon
-  VERSION = "0.1.0"
+  VERSION = "0.1.3"
 end
data/lib/url_common.rb CHANGED
@@ -30,6 +30,17 @@ module UrlCommon
     end
   end
 
+  # UrlCommon.parse_fid_from_amazon_url("https://www.amazon.com/Original-GEN-2-0-Screwdriver-Industrial-Technician/dp/B0845919P2/?_encoding=UTF8&pd_rd_w=cekvo&content-id=amzn1.sym.bbb6bbd8-d236-47cb-b42f-734cb0cacc1f&pf_rd_p=bbb6bbd8-d236-47cb-b42f-734cb0cacc1f&pf_rd_r=3WP00V89EKYCQ1PB16VY&pd_rd_wg=HlQVt&pd_rd_r=30b33abe-2010-435e-b2cc-338f2ffbf3cf&ref_=pd_gw_ci_mcx_mi")
+  def self.parse_fid_from_amazon_url(url)
+    tmp = /\/dp\/([A-Za-z0-9]+)/.match(url)
+    if tmp && tmp[1]
+      return tmp[1]
+    else
+      return nil
+    end
+  end
+
+
   def self.parse_country_from_itunes_url(url)
     country = /https?:\/\/itunes\.apple\.com\/(..)\//.match(url)
     if country
@@ -39,9 +50,22 @@ module UrlCommon
     return 'us'
   end
 
+  # original
+  # def self.get_base_domain(url)
+  #   parts = URI.parse(url)
+  #   return parts.host.gsub(/^www./,'')
+  # end
+
   def self.get_base_domain(url)
-    parts = URI.parse(url)
-    return parts.host.gsub(/^www./,'')
+    #debugger if url =~ /c06rh22whx1g/
+    begin
+      url = url.gsub(/ /,'%20')
+      parts = URI.parse(url)
+      return parts.host.gsub(/^www./,'')
+    rescue StandardError => e
+      fu = Fuzzyurl.from_string(url)
+      return fu.hostname.gsub(/^www./,'')
+    end
   end
 
   def self.join(base, rest, debug = false)
@@ -60,9 +84,19 @@ module UrlCommon
     end
   end
 
-  #TODO
   def self.count_links(html)
-    return 0
+    if html =~ /<html/i
+      content_type = "html"
+    else
+      content_type = "ascii"
+    end
+    parts = html.split(" ")
+    link_ctr = 0
+    parts.each do |part|
+      link_ctr = link_ctr + 1 if part =~ /https:?\/\// && content_type == 'ascii'
+      link_ctr = link_ctr + 1 if part =~ /<a [^>]+.+<\/a>/i && content_type == 'html'
+    end
+    link_ctr
   end
 
   def self.agent
@@ -82,13 +116,23 @@ module UrlCommon
   #
   def self.url_base(url, base_domain=nil)
     if base_domain.nil?
-      base_domain = get_base_domain(url)
+      base_domain = UrlCommon.get_base_domain(url)
+    end
+    begin
+      url = url.gsub(/ /,'%20')
+      parts = URI.parse(url)
+      extra = ""
+      extra = "?#{parts.query}" if parts.query
+      url_base = "#{base_domain}#{parts.path}#{extra}"
+      return url_base[0..254]
+    rescue StandardError => e
+      fu = Fuzzyurl.from_string(url)
+      base_domain = UrlCommon.get_base_domain(url)
+      extra = ""
+      extra = "?#{fu.query}" if fu.query
+      url_base = "#{base_domain}#{fu.path}#{extra}"
+      return url_base[0..254]
     end
-    parts = URI.parse(url)
-    extra = ""
-    extra = "?#{parts.query}" if parts.query
-    url_base = "#{base_domain}#{parts.path}#{extra}"
-    return url_base[0..254]
   end
 
   #tested #https://www.amazon.com/gp/product/B01DT4A2R4/ref=as_li_qf_sp_asin_il_tl?ie=UTF8&tag=nickjanetakis-20&camp=1789&creative=9325&linkCode=as2&creativeASIN=B01DT4A2R4&linkId=496be5e222b6291369c0a393c797c2c0
@@ -262,8 +306,241 @@ module UrlCommon
   #TODO needs tests
   def self.create_mechanize_page_from_html(url, html)
     mechanize_page = Mechanize::Page.new(nil, {'content-type'=>'text/html'}, html, nil, Mechanize.new)
+    url = url.gsub(/ /,'%20')
     mechanize_page.uri = URI.parse(url)
-
+
     return mechanize_page
   end
+
+  #TODO needs tests
+  def self.get_meta_description(url, html)
+    page = UrlCommon.create_mechanize_page_from_html(url, html)
+    description = ""
+    begin
+      description = page.parser.at("meta[name='description']")['content']
+    rescue StandardError => e
+    end
+    return description
+  end
+
+  #TODO needs tests
+  # UrlCommon.get_page_title("https://gist.github.com/fuzzygroup/811a9334b1a6dc394de74a23cb7e12fa")
+  def self.get_page_title(url, html)
+    page = UrlCommon.create_mechanize_page_from_html(url, html)
+    title = ""
+    begin
+      title = page.parser.css('title').first.content
+    rescue StandardError => e
+    end
+    return title
+  end
+
+  def self.extract_links_from_text(text)
+    agent = Mechanize.new
+    html = "<HTML><BODY>#{text}</BODY></HTML>"
+    page = Mechanize::Page.new(nil,{'content-type'=>'text/html'},html,nil,agent)
+    return page.links
+  end
+
+  # https://docs.aylien.com/textapi/#using-the-api
+  def self.summarize_url(url)
+    #GET /summarize?url=http://www.bbc.com/sport/0/football/25912393
+    agent = Mechanize.new
+    summarization_url = ""
+    page = agent.get(url)
+  end
+
+  # fucking idiotic test case for this fucking idiot is: https://devslopes.com/
+  def self.test_random_url(url_or_host)
+    random_filename = TextCommon.sha(Time.now.to_s) + ".xml"
+    if url_or_host =~ /http/
+      url = File.join(url_or_host, random_filename)
+    else
+      url = File.join("http://", host, random_filename)
+    end
+    status, url = UrlCommon.check_for_404(url, true)
+    #
+    # Key bit of logic -- if we get a return value for a randomized sha then that means that
+    # a) the destination site owner is a fucking moron
+    # b) that the destination site owner has set his site so it NEVER returns a 404
+    # c) they're a fucking moron
+    # d) if I get a 200 back then it means that they return you to the home page for anything and NOT
+    #    a proper 404 so need to flip flop the logic and return error on a 200; sheesh
+    #
+    return :error, url if status == :ok
+    return :ok, url
+  end
+
+  def self.select_best_rssurl_from_rssurls(urls)
+    return urls.sort_by(&:length).first
+  end
+
+  def self.possible_rssurls(site_url, skip_slash_blog = false)
+    # urls we will probe
+    possible_rssurl_formats = []
+
+    # normal baselines
+    possible_rssurl_formats << "feed.xml"
+    possible_rssurl_formats << "rss.xml"
+    possible_rssurl_formats << "atom.xml"
+    possible_rssurl_formats << "feed/"
+
+    # optionally look at /blog/
+    possible_rssurl_formats << "/blog/feed.xml"
+    possible_rssurl_formats << "/blog/rss.xml"
+    possible_rssurl_formats << "/blog/atom.xml"
+    possible_rssurl_formats << "/blog/feed/"
+
+    possible_rssurls = []
+    possible_rssurl_formats.each do |url_format|
+      possible_rssurls << UrlCommon.join(site_url, url_format)
+    end
+
+    return possible_rssurls
+  end
+
+  def self.parse_html_for_rssurl_from_head(site_url, page = nil, debug = false)
+    if page
+      status = :ok
+    else
+      status, page = UrlCommon.get_page(site_url)
+    end
+    puts "Into html parse for rssurl" if debug
+    possibles = []
+    if status == :ok && page
+      #results = page.css("link[rel='alternate']")
+      results = page.css("link[rel='alternate'][type='application/rss+xml']")
+      #
+      # If only a single one then return it
+      #
+      #return results.first['href'] if results.first['type'] =~ /application\/rss\+xml/i && results.size == 1
+      return results.first['href'] if results.size == 1
+
+      #
+      # If an array then filter out the comments
+      #
+      results.each do |result|
+        possibles << result unless result['title'] =~ /comments? feed/i
+      end
+
+      #
+      # Loop over the possibles and just return the shortest url
+      #
+      # Todo -- can likely do a better job on this
+      #
+      urls = []
+      possibles.each do |possible|
+        urls << possible['href']
+      end
+      return UrlCommon.select_best_rssurl_from_rssurls(urls)
+      #return urls.sort_by(&:length).first
+
+
+      # results.each do |result|
+      #
+      # end
+      # end
+      # doc = Nokogiri::HTML(page.body)
+      # results << doc.at('link[rel="alternate"]')
+      # results = results.flatten
+    end
+  end
+
+  def self.get_protocol(url)
+    parts = url.to_s.split(":")
+    return parts.first
+  end
+
+  #https://500hats.com/feed
+  # UrlCommon.discover_feed_url("https://nickjanetakis.com")
+  def self.discover_feed_url(site_url, debug = false)
+    # step 1: remove the file from the site_url if it has one
+    # step 2: problem the common ones and 404 check
+
+    #
+    # Build a set of possibles
+    #
+    possible_rssurls = UrlCommon.possible_rssurls(site_url)
+
+    #
+    # Keep track of failures
+    #
+    failed_probes = Set.new
+
+    # step 3: parse the html
+    #<link rel="alternate" type="application/rss+xml" href="http://scripting.com/rss.xml" />
+    #<link rel="alternate" type="application/rss+xml" title="Matt Mullenweg &raquo; Feed" href="https://ma.tt/feed/" />
+    #<link rel="alternate" type="application/rss+xml" title="Matt Mullenweg &raquo; Comments Feed" href="https://ma.tt/comments/feed/" />
+
+    #
+    # Stage 1 -- do http head probing
+    #
+    possible_rssurls.each do |rssurl|
+      puts "Head Probing for: #{rssurl}" if debug
+
+      # abort if we doubled blog i.e. /blog/blog/ in the url
+      next if rssurl =~ /blog\/blog/
+      next if failed_probes.include?(rssurl)
+
+      status, url = UrlCommon.check_for_404(rssurl, true)
+      random_status, random_url = UrlCommon.test_random_url(site_url)
+      #debugger
+      return rssurl if status == :ok && random_status == :ok
+      failed_probes << rssurl
+    end
+
+    puts "After probe, failed_probes as: #{failed_probes.inspect}"
+
+    #
+    # Stage 2 -- if subdirectory go up one level and probe again
+    #
+    # TODO
+
+
+
+    #
+    # Stage 3 -- Goto root and probe again
+    #
+    #test for this is the nick site
+    fuzzy_url_parts = Fuzzyurl.new(site_url)
+    base_url = "#{fuzzy_url_parts.protocol}://#{fuzzy_url_parts.hostname}"
+    possible_rssurls = UrlCommon.possible_rssurls(base_url)
+    #debugger
+    possible_rssurls.each do |rssurl|
+      puts "Head Probing for: #{rssurl} at site root stage" #if debug
+
+      # abort if we doubled blog i.e. /blog/blog/ in the url
+      next if rssurl =~ /blog\/blog/
+      next if failed_probes.include?(rssurl)
+
+      status, url = UrlCommon.check_for_404(rssurl, true)
+      return rssurl if status == :ok
+      failed_probes << rssurl
+    end
+
+
+    #
+    # Stage 4 - parse the html
+    #
+    rssurl = UrlCommon.parse_html_for_rssurl_from_head(site_url, nil, true)
+    return rssurl if rssurl
+
+    #
+    # Stage 5 - fall over to feedback
+    #
+    results = Feedbag.find(site_url)
+    # checked_results = []
+    # results.each do |result|
+    #   struct = UrlCommon.check_for_404(result)
+    #   checked_results << result if struct.status == 200
+    # end
+
+    #
+    # Stage 6 - cache failures to redis so don't look for them again
+    #
+    #$redis.
+
+    return UrlCommon.select_best_rssurl_from_rssurls(results)
+  end
+
 end
data/url_common.gemspec CHANGED
@@ -26,4 +26,8 @@ Gem::Specification.new do |spec|
   spec.bindir = "exe"
   spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
+
+  spec.add_dependency 'fuzzyurl', '~> 0.9.0'
+  spec.add_dependency 'mechanize', '~> 2.6'
+
 end
metadata CHANGED
@@ -1,15 +1,43 @@
 --- !ruby/object:Gem::Specification
 name: url_common
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.3
 platform: ruby
 authors:
 - Scott Johnson
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2020-08-12 00:00:00.000000000 Z
-dependencies: []
+date: 2022-07-02 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: fuzzyurl
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.9.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.9.0
+- !ruby/object:Gem::Dependency
+  name: mechanize
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.6'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.6'
 description: This is a class library for common url manipulation and crawling tasks. It
   is based on a career focused on the practical side of working with the Internet
   using Ruby.
@@ -24,6 +52,7 @@ files:
 - ".travis.yml"
 - CODE_OF_CONDUCT.md
 - Gemfile
+- Gemfile.lock
 - LICENSE.txt
 - README.md
 - Rakefile
@@ -55,7 +84,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.1.2
+rubygems_version: 3.3.7
 signing_key:
 specification_version: 4
 summary: This is a class library designed for common url manipulation and crawling
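
For orientation, a minimal usage sketch of a few helpers this release adds (not part of the diff above; the URLs and return values are illustrative, and it assumes the 0.1.3 gem and its mechanize/fuzzyurl dependencies are installed):

require 'url_common'

# Pull the product id out of an Amazon /dp/ URL (added in 0.1.3).
UrlCommon.parse_fid_from_amazon_url("https://www.amazon.com/Original-GEN-2-0-Screwdriver/dp/B0845919P2/")
# => "B0845919P2"

# Hostname without the leading www.; spaces are now escaped before parsing,
# and Fuzzyurl is used as a fallback when URI.parse rejects the string.
UrlCommon.get_base_domain("https://www.example.com/some page")
# => "example.com"

# Rough link count for plain text (HTML input counts <a> tags instead).
UrlCommon.count_links("see https://example.com and https://example.org")
# => 2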