url_common 0.1.1 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +7 -2
- data/Gemfile.lock +16 -6
- data/lib/url_common/version.rb +1 -1
- data/lib/url_common.rb +78 -11
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bf016ee73428bc8dc46afb13ddf8aa1e21840650e38ac0b6c03d91c69453afca
|
4
|
+
data.tar.gz: '098c5fcaa13ee7bf0b390bfae0af56784ed25547d7b0946591c2619344f004ca'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 780507307f3b1c6745cddba4eeb37cbad88a623c8b6f694a51798c0347d80db5fffbc2210ffafe0b272e270ecc48bf036c35c7071384a656bcf0b0ca30cdc791
|
7
|
+
data.tar.gz: bd449bfe93ecdfe735c947c8d6caba91962493778cbfe899cc456a5f056db2102c3ff208084f3fbd3ce47b3383c4046d3a1c4a6c5a5504e6974a81e9b809d0cd
|
data/Gemfile
CHANGED
@@ -3,10 +3,15 @@ source "https://rubygems.org"
|
|
3
3
|
# Specify your gem's dependencies in url_common.gemspec
|
4
4
|
gemspec
|
5
5
|
|
6
|
-
ruby "
|
6
|
+
ruby "3.1.2"
|
7
7
|
|
8
8
|
gem "rake", "~> 12.0"
|
9
9
|
gem "rspec", "~> 3.0"
|
10
10
|
gem "fuzzyurl", '~> 0.9.0'
|
11
|
-
gem 'mechanize', '~> 2.6'
|
12
11
|
gem "byebug"
|
12
|
+
|
13
|
+
gem "hpricot", "~> 0.8.6"
|
14
|
+
gem 'net-http-persistent', github: 'drbrain/net-http-persistent'
|
15
|
+
gem "mechanize", "~> 2.7"
|
16
|
+
|
17
|
+
gem "webrick", "~> 1.7"
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,14 @@
|
|
1
|
+
GIT
|
2
|
+
remote: https://github.com/drbrain/net-http-persistent.git
|
3
|
+
revision: 857c3baaa541644fa437328b535042a500414119
|
4
|
+
specs:
|
5
|
+
net-http-persistent (4.0.1)
|
6
|
+
connection_pool (~> 2.2)
|
7
|
+
|
1
8
|
PATH
|
2
9
|
remote: .
|
3
10
|
specs:
|
4
|
-
url_common (0.1.
|
11
|
+
url_common (0.1.5)
|
5
12
|
fuzzyurl (~> 0.9.0)
|
6
13
|
mechanize (~> 2.6)
|
7
14
|
|
@@ -9,11 +16,12 @@ GEM
|
|
9
16
|
remote: https://rubygems.org/
|
10
17
|
specs:
|
11
18
|
byebug (11.1.3)
|
12
|
-
connection_pool (2.2.
|
19
|
+
connection_pool (2.2.5)
|
13
20
|
diff-lcs (1.4.4)
|
14
21
|
domain_name (0.5.20190701)
|
15
22
|
unf (>= 0.0.5, < 1.0.0)
|
16
23
|
fuzzyurl (0.9.0)
|
24
|
+
hpricot (0.8.6)
|
17
25
|
http-cookie (1.0.3)
|
18
26
|
domain_name (~> 0.5)
|
19
27
|
mechanize (2.7.6)
|
@@ -30,8 +38,6 @@ GEM
|
|
30
38
|
mime-types-data (3.2020.0512)
|
31
39
|
mini_portile2 (2.4.0)
|
32
40
|
net-http-digest_auth (1.4.1)
|
33
|
-
net-http-persistent (4.0.0)
|
34
|
-
connection_pool (~> 2.2)
|
35
41
|
nokogiri (1.10.10)
|
36
42
|
mini_portile2 (~> 2.4.0)
|
37
43
|
ntlm-http (0.1.1)
|
@@ -52,6 +58,7 @@ GEM
|
|
52
58
|
unf (0.1.4)
|
53
59
|
unf_ext
|
54
60
|
unf_ext (0.0.7.7)
|
61
|
+
webrick (1.7.0)
|
55
62
|
webrobots (0.1.2)
|
56
63
|
|
57
64
|
PLATFORMS
|
@@ -60,13 +67,16 @@ PLATFORMS
|
|
60
67
|
DEPENDENCIES
|
61
68
|
byebug
|
62
69
|
fuzzyurl (~> 0.9.0)
|
63
|
-
|
70
|
+
hpricot (~> 0.8.6)
|
71
|
+
mechanize (~> 2.7)
|
72
|
+
net-http-persistent!
|
64
73
|
rake (~> 12.0)
|
65
74
|
rspec (~> 3.0)
|
66
75
|
url_common!
|
76
|
+
webrick (~> 1.7)
|
67
77
|
|
68
78
|
RUBY VERSION
|
69
|
-
ruby
|
79
|
+
ruby 3.1.2p20
|
70
80
|
|
71
81
|
BUNDLED WITH
|
72
82
|
2.1.4
|
data/lib/url_common/version.rb
CHANGED
data/lib/url_common.rb
CHANGED
@@ -30,6 +30,17 @@ module UrlCommon
|
|
30
30
|
end
|
31
31
|
end
|
32
32
|
|
33
|
+
# UrlCommon.parse_fid_from_amazon_url("https://www.amazon.com/Original-GEN-2-0-Screwdriver-Industrial-Technician/dp/B0845919P2/?_encoding=UTF8&pd_rd_w=cekvo&content-id=amzn1.sym.bbb6bbd8-d236-47cb-b42f-734cb0cacc1f&pf_rd_p=bbb6bbd8-d236-47cb-b42f-734cb0cacc1f&pf_rd_r=3WP00V89EKYCQ1PB16VY&pd_rd_wg=HlQVt&pd_rd_r=30b33abe-2010-435e-b2cc-338f2ffbf3cf&ref_=pd_gw_ci_mcx_mi")
|
34
|
+
# Extracts the identifier that follows "/dp/" in an Amazon product URL.
#
# url - the product URL; anything that responds to to_s (String, URI, nil).
#
# Returns the run of ASCII letters and digits found right after "/dp/",
# or nil when the URL has no such segment.
def self.parse_fid_from_amazon_url(url)
  # to_s keeps URI objects from raising TypeError inside the regexp match.
  url.to_s[%r{/dp/([A-Za-z0-9]+)}, 1]
end
|
42
|
+
|
43
|
+
|
33
44
|
def self.parse_country_from_itunes_url(url)
|
34
45
|
country = /https?:\/\/itunes\.apple\.com\/(..)\//.match(url)
|
35
46
|
if country
|
@@ -39,9 +50,22 @@ module UrlCommon
|
|
39
50
|
return 'us'
|
40
51
|
end
|
41
52
|
|
53
|
+
# original
|
54
|
+
# def self.get_base_domain(url)
|
55
|
+
# parts = URI.parse(url)
|
56
|
+
# return parts.host.gsub(/^www./,'')
|
57
|
+
# end
|
58
|
+
|
42
59
|
# Returns the host of +url+ with a leading "www." removed.
#
# Spaces are percent-encoded before parsing because URI.parse rejects them.
# When URI cannot produce a host (parse error, or no host component),
# Fuzzyurl is used as a fallback parser.
#
# url - the URL as a String.
#
# Returns the host name String without a leading "www.".
def self.get_base_domain(url)
  begin
    url = url.gsub(/ /, '%20')
    # \A and the escaped dot strip only a literal "www." prefix, so a host
    # such as "wwwexample.com" is returned unchanged.
    URI.parse(url).host.sub(/\Awww\./, '')
  rescue StandardError
    Fuzzyurl.from_string(url).hostname.sub(/\Awww\./, '')
  end
end
|
46
70
|
|
47
71
|
def self.join(base, rest, debug = false)
|
@@ -60,9 +84,19 @@ module UrlCommon
|
|
60
84
|
end
|
61
85
|
end
|
62
86
|
|
63
|
-
#TODO
|
64
87
|
# Counts the links contained in +html+.
#
# Input containing an "<html" tag is treated as HTML and every
# <a ...>...</a> element is counted. Anything else is treated as plain text
# and every whitespace-separated token containing http:// or https:// is
# counted.
#
# html - the document text (nil is treated as an empty document).
#
# Returns the number of links as an Integer.
def self.count_links(html)
  text = html.to_s
  if text.match?(/<html/i)
    # Scan the whole document: an anchor tag contains whitespace, so it can
    # never be matched inside a whitespace-split token.
    text.scan(/<a\s[^>]*>.*?<\/a>/im).size
  else
    text.split.count { |token| token.match?(%r{https?://}) }
  end
end
|
67
101
|
|
68
102
|
def self.agent
|
@@ -82,13 +116,23 @@ module UrlCommon
|
|
82
116
|
#
|
83
117
|
# Builds "<base_domain><path>[?<query>]" for +url+, capped at 255 characters.
#
# url         - the URL as a String; spaces are percent-encoded before parsing.
# base_domain - host to prefix the result with; when nil it is derived with
#               UrlCommon.get_base_domain(url).
#
# URI.parse is tried first; if it raises, Fuzzyurl supplies the path and
# query instead. A caller-supplied base_domain is honoured on both paths.
#
# Returns the String described above, truncated to its first 255 characters.
def self.url_base(url, base_domain=nil)
  base_domain = UrlCommon.get_base_domain(url) if base_domain.nil?

  begin
    url = url.gsub(/ /, '%20')
    parsed = URI.parse(url)
    path = parsed.path
    query = parsed.query
  rescue StandardError
    fuzzy = Fuzzyurl.from_string(url)
    path = fuzzy.path
    query = fuzzy.query
  end

  suffix = query ? "?#{query}" : ""
  "#{base_domain}#{path}#{suffix}"[0..254]
end
|
93
137
|
|
94
138
|
#tested #https://www.amazon.com/gp/product/B01DT4A2R4/ref=as_li_qf_sp_asin_il_tl?ie=UTF8&tag=nickjanetakis-20&camp=1789&creative=9325&linkCode=as2&creativeASIN=B01DT4A2R4&linkId=496be5e222b6291369c0a393c797c2c0
|
@@ -262,8 +306,9 @@ module UrlCommon
|
|
262
306
|
#TODO needs tests
|
263
307
|
# Wraps already-fetched +html+ in a Mechanize::Page whose uri is +url+.
#
# url  - the page URL as a String; spaces are percent-encoded before parsing.
# html - the page body.
#
# Returns the Mechanize::Page.
def self.create_mechanize_page_from_html(url, html)
  headers = {'content-type'=>'text/html'}
  page = Mechanize::Page.new(nil, headers, html, nil, Mechanize.new)
  page.uri = URI.parse(url.gsub(/ /, '%20'))
  page
end
|
269
314
|
|
@@ -279,6 +324,7 @@ module UrlCommon
|
|
279
324
|
end
|
280
325
|
|
281
326
|
#TODO needs tests
|
327
|
+
# UrlCommon.get_page_title("https://gist.github.com/fuzzygroup/811a9334b1a6dc394de74a23cb7e12fa")
|
282
328
|
def self.get_page_title(url, html)
|
283
329
|
page = UrlCommon.create_mechanize_page_from_html(url, html)
|
284
330
|
title = ""
|
@@ -496,5 +542,26 @@ module UrlCommon
|
|
496
542
|
|
497
543
|
return UrlCommon.select_best_rssurl_from_rssurls(results)
|
498
544
|
end
|
545
|
+
|
546
|
+
#based on this approach: https://medium.com/@sparkboldstudio/building-a-url-shortener-rails-app-96db60d3bf9d
|
547
|
+
#def self.generate_short_fid
|
548
|
+
#def self.create_fid
|
549
|
+
# Generates a random base-36 identifier (digits 0-9 and letters a-z).
#
# length - upper bound on the number of characters (default: 8). The id is
#          not padded, so it is shorter when the random number drawn is
#          small.
#
# Returns a String of 1 to +length+ characters.
# Raises ArgumentError when +length+ is not a positive Integer.
def self.generate_fid(length = 8)
  unless length.is_a?(Integer) && length.positive?
    raise ArgumentError, "length must be a positive integer"
  end

  rand(36**length).to_s(36)
end
|
552
|
+
|
553
|
+
# Normalises +long_url+ and appends it to +url_base+.
#
# The URL is stripped of surrounding whitespace, downcased, and has every
# "http://", "https://" and "www." occurrence removed.
#
# long_url - the URL to normalise (not modified).
# url_base - the String prefix for the result.
#
# Returns url_base followed by the normalised URL.
def self.sanitize(long_url, url_base)
  # Non-bang strip leaves the caller's string untouched and also works on
  # frozen strings; long_url is a local, so it is read directly.
  cleaned = long_url.strip.downcase.gsub(/(https?:\/\/)|(www\.)/, "")
  url_base + cleaned
end
|
559
|
+
|
560
|
+
# todo expand on this by checking to make sure it doesn't call again
|
561
|
+
# should be fine but ttfn
|
562
|
+
# Builds the permalink for +url+ under the pullquotes.io pull_quotes path.
#
# tbl - currently unused.
# fld - currently unused.
# url - the long URL to sanitise.
#
# Returns the String produced by UrlCommon.sanitize.
def self.create_permalink(tbl, fld, url)
  # The former UrlCommon.generate_short_url call was removed: its result was
  # never used and no method of that name is visible in this module (the id
  # generator defined here is generate_fid).
  UrlCommon.sanitize(url, "https://pullquotes.io/pull_quotes/")
end
|
499
566
|
|
500
567
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_common
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Scott Johnson
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-07-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: fuzzyurl
|
@@ -84,7 +84,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
84
84
|
- !ruby/object:Gem::Version
|
85
85
|
version: '0'
|
86
86
|
requirements: []
|
87
|
-
rubygems_version: 3.
|
87
|
+
rubygems_version: 3.3.7
|
88
88
|
signing_key:
|
89
89
|
specification_version: 4
|
90
90
|
summary: This is a class library designed for common url manipulation and crawling
|