url_common 0.1.1 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: e75ba5e9d78301e4cf753f4ade0a54b036aaca17d7527791b984d8bc81215a5a
-  data.tar.gz: 7bae7598dd4884c9c795e511c32c02cca9095a32f23522e0241842631250488b
+  metadata.gz: bf016ee73428bc8dc46afb13ddf8aa1e21840650e38ac0b6c03d91c69453afca
+  data.tar.gz: '098c5fcaa13ee7bf0b390bfae0af56784ed25547d7b0946591c2619344f004ca'
 SHA512:
-  metadata.gz: d633158a6b206eca935948a72ebcce4bf90a196c373eca02ae9895819f5b027333b60e4dc5ec84f7b6507d4f468ac1752d36ca367abae4b03b65fd6665857095
-  data.tar.gz: a48dc2eace4656f5b2e16957174ae399633d97ddcff830d799ce31dee94ebfcbe648a4e3a1eb78c0ea8a91f63d8df22df783eed7f99d21920a2ea6a3b6edef44
+  metadata.gz: 780507307f3b1c6745cddba4eeb37cbad88a623c8b6f694a51798c0347d80db5fffbc2210ffafe0b272e270ecc48bf036c35c7071384a656bcf0b0ca30cdc791
+  data.tar.gz: bd449bfe93ecdfe735c947c8d6caba91962493778cbfe899cc456a5f056db2102c3ff208084f3fbd3ce47b3383c4046d3a1c4a6c5a5504e6974a81e9b809d0cd
data/Gemfile CHANGED
@@ -3,10 +3,15 @@ source "https://rubygems.org"
 # Specify your gem's dependencies in url_common.gemspec
 gemspec
 
-ruby "2.7.1"
+ruby "3.1.2"
 
 gem "rake", "~> 12.0"
 gem "rspec", "~> 3.0"
 gem "fuzzyurl", '~> 0.9.0'
-gem 'mechanize', '~> 2.6'
 gem "byebug"
+
+gem "hpricot", "~> 0.8.6"
+gem 'net-http-persistent', github: 'drbrain/net-http-persistent'
+gem "mechanize", "~> 2.7"
+
+gem "webrick", "~> 1.7"
data/Gemfile.lock CHANGED
@@ -1,7 +1,14 @@
+GIT
+  remote: https://github.com/drbrain/net-http-persistent.git
+  revision: 857c3baaa541644fa437328b535042a500414119
+  specs:
+    net-http-persistent (4.0.1)
+      connection_pool (~> 2.2)
+
 PATH
   remote: .
   specs:
-    url_common (0.1.1)
+    url_common (0.1.5)
       fuzzyurl (~> 0.9.0)
       mechanize (~> 2.6)
 
@@ -9,11 +16,12 @@ GEM
   remote: https://rubygems.org/
   specs:
     byebug (11.1.3)
-    connection_pool (2.2.3)
+    connection_pool (2.2.5)
     diff-lcs (1.4.4)
     domain_name (0.5.20190701)
       unf (>= 0.0.5, < 1.0.0)
     fuzzyurl (0.9.0)
+    hpricot (0.8.6)
     http-cookie (1.0.3)
       domain_name (~> 0.5)
     mechanize (2.7.6)
@@ -30,8 +38,6 @@ GEM
     mime-types-data (3.2020.0512)
     mini_portile2 (2.4.0)
     net-http-digest_auth (1.4.1)
-    net-http-persistent (4.0.0)
-      connection_pool (~> 2.2)
     nokogiri (1.10.10)
       mini_portile2 (~> 2.4.0)
     ntlm-http (0.1.1)
@@ -52,6 +58,7 @@ GEM
     unf (0.1.4)
       unf_ext
     unf_ext (0.0.7.7)
+    webrick (1.7.0)
     webrobots (0.1.2)
 
 PLATFORMS
@@ -60,13 +67,16 @@ PLATFORMS
 DEPENDENCIES
   byebug
   fuzzyurl (~> 0.9.0)
-  mechanize (~> 2.6)
+  hpricot (~> 0.8.6)
+  mechanize (~> 2.7)
+  net-http-persistent!
   rake (~> 12.0)
   rspec (~> 3.0)
   url_common!
+  webrick (~> 1.7)
 
 RUBY VERSION
-   ruby 2.7.1p83
+   ruby 3.1.2p20
 
 BUNDLED WITH
    2.1.4
data/lib/url_common/version.rb CHANGED
@@ -1,3 +1,3 @@
 module UrlCommon
-  VERSION = "0.1.1"
+  VERSION = "0.1.5"
 end
data/lib/url_common.rb CHANGED
@@ -30,6 +30,17 @@ module UrlCommon
     end
   end
 
+  # UrlCommon.parse_fid_from_amazon_url("https://www.amazon.com/Original-GEN-2-0-Screwdriver-Industrial-Technician/dp/B0845919P2/?_encoding=UTF8&pd_rd_w=cekvo&content-id=amzn1.sym.bbb6bbd8-d236-47cb-b42f-734cb0cacc1f&pf_rd_p=bbb6bbd8-d236-47cb-b42f-734cb0cacc1f&pf_rd_r=3WP00V89EKYCQ1PB16VY&pd_rd_wg=HlQVt&pd_rd_r=30b33abe-2010-435e-b2cc-338f2ffbf3cf&ref_=pd_gw_ci_mcx_mi")
+  def self.parse_fid_from_amazon_url(url)
+    tmp = /\/dp\/([A-Za-z0-9]+)/.match(url)
+    if tmp && tmp[1]
+      return tmp[1]
+    else
+      return nil
+    end
+  end
+
+
   def self.parse_country_from_itunes_url(url)
     country = /https?:\/\/itunes\.apple\.com\/(..)\//.match(url)
     if country
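The parse_fid_from_amazon_url helper added here extracts the ASIN from the /dp/ segment of an Amazon product URL and returns nil when no such segment is present. A minimal usage sketch (URLs shortened from the example in the comment above; the second call is illustrative):

  require "url_common"

  UrlCommon.parse_fid_from_amazon_url("https://www.amazon.com/Original-GEN-2-0-Screwdriver-Industrial-Technician/dp/B0845919P2/")
  # => "B0845919P2"
  UrlCommon.parse_fid_from_amazon_url("https://www.amazon.com/gp/help/customer/")
  # => nil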
@@ -39,9 +50,22 @@ module UrlCommon
     return 'us'
   end
 
+  # original
+  # def self.get_base_domain(url)
+  #   parts = URI.parse(url)
+  #   return parts.host.gsub(/^www./,'')
+  # end
+
   def self.get_base_domain(url)
-    parts = URI.parse(url)
-    return parts.host.gsub(/^www./,'')
+    #debugger if url =~ /c06rh22whx1g/
+    begin
+      url = url.gsub(/ /,'%20')
+      parts = URI.parse(url)
+      return parts.host.gsub(/^www./,'')
+    rescue StandardError => e
+      fu = Fuzzyurl.from_string(url)
+      return fu.hostname.gsub(/^www./,'')
+    end
   end
 
   def self.join(base, rest, debug = false)
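get_base_domain now percent-encodes spaces, tries URI.parse, and falls back to Fuzzyurl when parsing raises, instead of letting the exception escape. A rough sketch of the behaviour, using illustrative inputs:

  UrlCommon.get_base_domain("https://www.example.com/some page")
  # => "example.com"
  UrlCommon.get_base_domain("https://www.example.com/a|b")
  # => "example.com"  (URI.parse rejects the "|", so the Fuzzyurl fallback handles it)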
@@ -60,9 +84,19 @@ module UrlCommon
     end
   end
 
-  #TODO
   def self.count_links(html)
-    return 0
+    if html =~ /<html/i
+      content_type = "html"
+    else
+      content_type = "ascii"
+    end
+    parts = html.split(" ")
+    link_ctr = 0
+    parts.each do |part|
+      link_ctr = link_ctr + 1 if part =~ /https:?\/\// && content_type == 'ascii'
+      link_ctr = link_ctr + 1 if part =~ /<a [^>]+.+<\/a>/i && content_type == 'html'
+    end
+    link_ctr
   end
 
   def self.agent
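count_links replaces the old stub: input containing "<html" is treated as markup, anything else as plain text, and the string is split on whitespace with matching tokens counted. For plain text that looks like:

  UrlCommon.count_links("see https://a.example and https://b.example")
  # => 2

Two quirks of the patterns as written: the plain-text branch matches https:// but not http://, and the HTML branch only counts a whitespace-delimited token containing an entire <a ...>...</a> element, which an anchor written with a space after <a never produces.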
@@ -82,13 +116,23 @@ module UrlCommon
   #
   def self.url_base(url, base_domain=nil)
     if base_domain.nil?
-      base_domain = get_base_domain(url)
+      base_domain = UrlCommon.get_base_domain(url)
+    end
+    begin
+      url = url.gsub(/ /,'%20')
+      parts = URI.parse(url)
+      extra = ""
+      extra = "?#{parts.query}" if parts.query
+      url_base = "#{base_domain}#{parts.path}#{extra}"
+      return url_base[0..254]
+    rescue StandardError => e
+      fu = Fuzzyurl.from_string(url)
+      base_domain = UrlCommon.get_base_domain(url)
+      extra = ""
+      extra = "?#{fu.query}" if fu.query
+      url_base = "#{base_domain}#{fu.path}#{extra}"
+      return url_base[0..254]
     end
-    parts = URI.parse(url)
-    extra = ""
-    extra = "?#{parts.query}" if parts.query
-    url_base = "#{base_domain}#{parts.path}#{extra}"
-    return url_base[0..254]
   end
 
   #tested #https://www.amazon.com/gp/product/B01DT4A2R4/ref=as_li_qf_sp_asin_il_tl?ie=UTF8&tag=nickjanetakis-20&camp=1789&creative=9325&linkCode=as2&creativeASIN=B01DT4A2R4&linkId=496be5e222b6291369c0a393c797c2c0
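url_base follows the same pattern as get_base_domain: spaces are percent-encoded, URI.parse failures fall back to Fuzzyurl, and the result (base domain plus path and query string) is truncated to 255 characters. A quick sketch with an illustrative URL:

  UrlCommon.url_base("https://www.example.com/path/page?id=1&x=2")
  # => "example.com/path/page?id=1&x=2"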
@@ -262,8 +306,9 @@ module UrlCommon
   #TODO needs tests
   def self.create_mechanize_page_from_html(url, html)
     mechanize_page = Mechanize::Page.new(nil, {'content-type'=>'text/html'}, html, nil, Mechanize.new)
+    url = url.gsub(/ /,'%20')
     mechanize_page.uri = URI.parse(url)
-
+
     return mechanize_page
   end
 
@@ -279,6 +324,7 @@ module UrlCommon
   end
 
   #TODO needs tests
+  # UrlCommon.get_page_title("https://gist.github.com/fuzzygroup/811a9334b1a6dc394de74a23cb7e12fa")
   def self.get_page_title(url, html)
     page = UrlCommon.create_mechanize_page_from_html(url, html)
     title = ""
@@ -496,5 +542,26 @@ module UrlCommon
 
     return UrlCommon.select_best_rssurl_from_rssurls(results)
   end
+
+  #based on this approach: https://medium.com/@sparkboldstudio/building-a-url-shortener-rails-app-96db60d3bf9d
+  #def self.generate_short_fid
+  #def self.create_fid
+  def self.generate_fid
+    rand(36**8).to_s(36)
+  end
+
+  def self.sanitize(long_url, url_base)
+    long_url.strip!
+    sanitize_url = self.long_url.downcase.gsub(/(https?:\/\/)|(www\.)/,"")
+    #"http://#{sanitize_url}"
+    url_base + sanitize_url
+  end
+
+  # todo expand on this by checking to make sure it doesn't call again
+  # should be fine but ttfn
+  def self.create_permalink(tbl, fld, url)
+    short_url = UrlCommon.generate_short_url
+    sanitized_url = UrlCommon.sanitize(url, "https://pullquotes.io/pull_quotes/")
+  end
 
 end
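The URL-shortener helpers added at the bottom follow the linked Medium approach: generate_fid returns a random base-36 string of up to eight characters. As published, sanitize calls self.long_url rather than its long_url argument, and create_permalink calls UrlCommon.generate_short_url, which does not appear in this diff; if neither method is defined elsewhere in the gem, both calls would raise NoMethodError. A hypothetical corrected sketch of the sanitize step, assuming the argument was intended:

  # hypothetical sketch, not the published code
  def self.sanitize(long_url, url_base)
    cleaned = long_url.strip.downcase.gsub(/(https?:\/\/)|(www\.)/, "")
    url_base + cleaned
  end

  UrlCommon.generate_fid  # => e.g. "k2j9x0ab"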
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: url_common
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.5
 platform: ruby
 authors:
 - Scott Johnson
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2022-06-04 00:00:00.000000000 Z
+date: 2022-07-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: fuzzyurl
@@ -84,7 +84,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.1.2
+rubygems_version: 3.3.7
 signing_key:
 specification_version: 4
 summary: This is a class library designed for common url manipulation and crawling