url_common 0.1.1 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e75ba5e9d78301e4cf753f4ade0a54b036aaca17d7527791b984d8bc81215a5a
4
- data.tar.gz: 7bae7598dd4884c9c795e511c32c02cca9095a32f23522e0241842631250488b
3
+ metadata.gz: bf016ee73428bc8dc46afb13ddf8aa1e21840650e38ac0b6c03d91c69453afca
4
+ data.tar.gz: '098c5fcaa13ee7bf0b390bfae0af56784ed25547d7b0946591c2619344f004ca'
5
5
  SHA512:
6
- metadata.gz: d633158a6b206eca935948a72ebcce4bf90a196c373eca02ae9895819f5b027333b60e4dc5ec84f7b6507d4f468ac1752d36ca367abae4b03b65fd6665857095
7
- data.tar.gz: a48dc2eace4656f5b2e16957174ae399633d97ddcff830d799ce31dee94ebfcbe648a4e3a1eb78c0ea8a91f63d8df22df783eed7f99d21920a2ea6a3b6edef44
6
+ metadata.gz: 780507307f3b1c6745cddba4eeb37cbad88a623c8b6f694a51798c0347d80db5fffbc2210ffafe0b272e270ecc48bf036c35c7071384a656bcf0b0ca30cdc791
7
+ data.tar.gz: bd449bfe93ecdfe735c947c8d6caba91962493778cbfe899cc456a5f056db2102c3ff208084f3fbd3ce47b3383c4046d3a1c4a6c5a5504e6974a81e9b809d0cd
data/Gemfile CHANGED
@@ -3,10 +3,15 @@ source "https://rubygems.org"
3
3
  # Specify your gem's dependencies in url_common.gemspec
4
4
  gemspec
5
5
 
6
- ruby "2.7.1"
6
+ ruby "3.1.2"
7
7
 
8
8
  gem "rake", "~> 12.0"
9
9
  gem "rspec", "~> 3.0"
10
10
  gem "fuzzyurl", '~> 0.9.0'
11
- gem 'mechanize', '~> 2.6'
12
11
  gem "byebug"
12
+
13
+ gem "hpricot", "~> 0.8.6"
14
+ gem 'net-http-persistent', github: 'drbrain/net-http-persistent'
15
+ gem "mechanize", "~> 2.7"
16
+
17
+ gem "webrick", "~> 1.7"
data/Gemfile.lock CHANGED
@@ -1,7 +1,14 @@
1
+ GIT
2
+ remote: https://github.com/drbrain/net-http-persistent.git
3
+ revision: 857c3baaa541644fa437328b535042a500414119
4
+ specs:
5
+ net-http-persistent (4.0.1)
6
+ connection_pool (~> 2.2)
7
+
1
8
  PATH
2
9
  remote: .
3
10
  specs:
4
- url_common (0.1.1)
11
+ url_common (0.1.5)
5
12
  fuzzyurl (~> 0.9.0)
6
13
  mechanize (~> 2.6)
7
14
 
@@ -9,11 +16,12 @@ GEM
9
16
  remote: https://rubygems.org/
10
17
  specs:
11
18
  byebug (11.1.3)
12
- connection_pool (2.2.3)
19
+ connection_pool (2.2.5)
13
20
  diff-lcs (1.4.4)
14
21
  domain_name (0.5.20190701)
15
22
  unf (>= 0.0.5, < 1.0.0)
16
23
  fuzzyurl (0.9.0)
24
+ hpricot (0.8.6)
17
25
  http-cookie (1.0.3)
18
26
  domain_name (~> 0.5)
19
27
  mechanize (2.7.6)
@@ -30,8 +38,6 @@ GEM
30
38
  mime-types-data (3.2020.0512)
31
39
  mini_portile2 (2.4.0)
32
40
  net-http-digest_auth (1.4.1)
33
- net-http-persistent (4.0.0)
34
- connection_pool (~> 2.2)
35
41
  nokogiri (1.10.10)
36
42
  mini_portile2 (~> 2.4.0)
37
43
  ntlm-http (0.1.1)
@@ -52,6 +58,7 @@ GEM
52
58
  unf (0.1.4)
53
59
  unf_ext
54
60
  unf_ext (0.0.7.7)
61
+ webrick (1.7.0)
55
62
  webrobots (0.1.2)
56
63
 
57
64
  PLATFORMS
@@ -60,13 +67,16 @@ PLATFORMS
60
67
  DEPENDENCIES
61
68
  byebug
62
69
  fuzzyurl (~> 0.9.0)
63
- mechanize (~> 2.6)
70
+ hpricot (~> 0.8.6)
71
+ mechanize (~> 2.7)
72
+ net-http-persistent!
64
73
  rake (~> 12.0)
65
74
  rspec (~> 3.0)
66
75
  url_common!
76
+ webrick (~> 1.7)
67
77
 
68
78
  RUBY VERSION
69
- ruby 2.7.1p83
79
+ ruby 3.1.2p20
70
80
 
71
81
  BUNDLED WITH
72
82
  2.1.4
@@ -1,3 +1,3 @@
1
1
  module UrlCommon
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.5"
3
3
  end
data/lib/url_common.rb CHANGED
@@ -30,6 +30,17 @@ module UrlCommon
30
30
  end
31
31
  end
32
32
 
33
+ # UrlCommon.parse_fid_from_amazon_url("https://www.amazon.com/Original-GEN-2-0-Screwdriver-Industrial-Technician/dp/B0845919P2/?_encoding=UTF8&pd_rd_w=cekvo&content-id=amzn1.sym.bbb6bbd8-d236-47cb-b42f-734cb0cacc1f&pf_rd_p=bbb6bbd8-d236-47cb-b42f-734cb0cacc1f&pf_rd_r=3WP00V89EKYCQ1PB16VY&pd_rd_wg=HlQVt&pd_rd_r=30b33abe-2010-435e-b2cc-338f2ffbf3cf&ref_=pd_gw_ci_mcx_mi")
34
+ def self.parse_fid_from_amazon_url(url)
35
+ tmp = /\/dp\/([A-Za-z0-9]+)/.match(url)
36
+ if tmp && tmp[1]
37
+ return tmp[1]
38
+ else
39
+ return nil
40
+ end
41
+ end
42
+
43
+
33
44
  def self.parse_country_from_itunes_url(url)
34
45
  country = /https?:\/\/itunes\.apple\.com\/(..)\//.match(url)
35
46
  if country
@@ -39,9 +50,22 @@ module UrlCommon
39
50
  return 'us'
40
51
  end
41
52
 
53
+ # original
54
+ # def self.get_base_domain(url)
55
+ # parts = URI.parse(url)
56
+ # return parts.host.gsub(/^www./,'')
57
+ # end
58
+
42
59
  def self.get_base_domain(url)
43
- parts = URI.parse(url)
44
- return parts.host.gsub(/^www./,'')
60
+ #debugger if url =~ /c06rh22whx1g/
61
+ begin
62
+ url = url.gsub(/ /,'%20')
63
+ parts = URI.parse(url)
64
+ return parts.host.gsub(/^www./,'')
65
+ rescue StandardError => e
66
+ fu = Fuzzyurl.from_string(url)
67
+ return fu.hostname.gsub(/^www./,'')
68
+ end
45
69
  end
46
70
 
47
71
  def self.join(base, rest, debug = false)
@@ -60,9 +84,19 @@ module UrlCommon
60
84
  end
61
85
  end
62
86
 
63
- #TODO
64
87
  def self.count_links(html)
65
- return 0
88
+ if html =~ /<html/i
89
+ content_type = "html"
90
+ else
91
+ content_type = "ascii"
92
+ end
93
+ parts = html.split(" ")
94
+ link_ctr = 0
95
+ parts.each do |part|
96
+ link_ctr = link_ctr + 1 if part =~ /https:?\/\// && content_type == 'ascii'
97
+ link_ctr = link_ctr + 1 if part =~ /<a [^>]+.+<\/a>/i && content_type == 'html'
98
+ end
99
+ link_ctr
66
100
  end
67
101
 
68
102
  def self.agent
@@ -82,13 +116,23 @@ module UrlCommon
82
116
  #
83
117
  def self.url_base(url, base_domain=nil)
84
118
  if base_domain.nil?
85
- base_domain = get_base_domain(url)
119
+ base_domain = UrlCommon.get_base_domain(url)
120
+ end
121
+ begin
122
+ url = url.gsub(/ /,'%20')
123
+ parts = URI.parse(url)
124
+ extra = ""
125
+ extra = "?#{parts.query}" if parts.query
126
+ url_base = "#{base_domain}#{parts.path}#{extra}"
127
+ return url_base[0..254]
128
+ rescue StandardError => e
129
+ fu = Fuzzyurl.from_string(url)
130
+ base_domain = UrlCommon.get_base_domain(url)
131
+ extra = ""
132
+ extra = "?#{fu.query}" if fu.query
133
+ url_base = "#{base_domain}#{fu.path}#{extra}"
134
+ return url_base[0..254]
86
135
  end
87
- parts = URI.parse(url)
88
- extra = ""
89
- extra = "?#{parts.query}" if parts.query
90
- url_base = "#{base_domain}#{parts.path}#{extra}"
91
- return url_base[0..254]
92
136
  end
93
137
 
94
138
  #tested #https://www.amazon.com/gp/product/B01DT4A2R4/ref=as_li_qf_sp_asin_il_tl?ie=UTF8&tag=nickjanetakis-20&camp=1789&creative=9325&linkCode=as2&creativeASIN=B01DT4A2R4&linkId=496be5e222b6291369c0a393c797c2c0
@@ -262,8 +306,9 @@ module UrlCommon
262
306
  #TODO needs tests
263
307
  def self.create_mechanize_page_from_html(url, html)
264
308
  mechanize_page = Mechanize::Page.new(nil, {'content-type'=>'text/html'}, html, nil, Mechanize.new)
309
+ url = url.gsub(/ /,'%20')
265
310
  mechanize_page.uri = URI.parse(url)
266
-
311
+
267
312
  return mechanize_page
268
313
  end
269
314
 
@@ -279,6 +324,7 @@ module UrlCommon
279
324
  end
280
325
 
281
326
  #TODO needs tests
327
+ # UrlCommon.get_page_title("https://gist.github.com/fuzzygroup/811a9334b1a6dc394de74a23cb7e12fa")
282
328
  def self.get_page_title(url, html)
283
329
  page = UrlCommon.create_mechanize_page_from_html(url, html)
284
330
  title = ""
@@ -496,5 +542,26 @@ module UrlCommon
496
542
 
497
543
  return UrlCommon.select_best_rssurl_from_rssurls(results)
498
544
  end
545
+
546
+ #based on this approach: https://medium.com/@sparkboldstudio/building-a-url-shortener-rails-app-96db60d3bf9d
547
+ #def self.generate_short_fid
548
+ #def self.create_fid
549
+ def self.generate_fid
550
+ rand(36**8).to_s(36)
551
+ end
552
+
553
+ def self.sanitize(long_url, url_base)
554
+ long_url.strip!
555
+ sanitize_url = self.long_url.downcase.gsub(/(https?:\/\/)|(www\.)/,"")
556
+ #"http://#{sanitize_url}"
557
+ url_base + sanitize_url
558
+ end
559
+
560
+ # todo expand on this by checking to make sure it doesn't call again
561
+ # should be fine but ttfn
562
+ def self.create_permalink(tbl, fld, url)
563
+ short_url = UrlCommon.generate_short_url
564
+ sanitized_url = UrlCommon.sanitize(url, "https://pullquotes.io/pull_quotes/")
565
+ end
499
566
 
500
567
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: url_common
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Scott Johnson
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-06-04 00:00:00.000000000 Z
11
+ date: 2022-07-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: fuzzyurl
@@ -84,7 +84,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
84
84
  - !ruby/object:Gem::Version
85
85
  version: '0'
86
86
  requirements: []
87
- rubygems_version: 3.1.2
87
+ rubygems_version: 3.3.7
88
88
  signing_key:
89
89
  specification_version: 4
90
90
  summary: This is a class library designed for common url manipulation and crawling