url_common 0.1.1 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +7 -2
- data/Gemfile.lock +16 -6
- data/lib/url_common/version.rb +1 -1
- data/lib/url_common.rb +78 -11
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bf016ee73428bc8dc46afb13ddf8aa1e21840650e38ac0b6c03d91c69453afca
|
4
|
+
data.tar.gz: '098c5fcaa13ee7bf0b390bfae0af56784ed25547d7b0946591c2619344f004ca'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 780507307f3b1c6745cddba4eeb37cbad88a623c8b6f694a51798c0347d80db5fffbc2210ffafe0b272e270ecc48bf036c35c7071384a656bcf0b0ca30cdc791
|
7
|
+
data.tar.gz: bd449bfe93ecdfe735c947c8d6caba91962493778cbfe899cc456a5f056db2102c3ff208084f3fbd3ce47b3383c4046d3a1c4a6c5a5504e6974a81e9b809d0cd
|
data/Gemfile
CHANGED
@@ -3,10 +3,15 @@ source "https://rubygems.org"
|
|
3
3
|
# Specify your gem's dependencies in url_common.gemspec
|
4
4
|
gemspec
|
5
5
|
|
6
|
-
ruby "
|
6
|
+
ruby "3.1.2"
|
7
7
|
|
8
8
|
gem "rake", "~> 12.0"
|
9
9
|
gem "rspec", "~> 3.0"
|
10
10
|
gem "fuzzyurl", '~> 0.9.0'
|
11
|
-
gem 'mechanize', '~> 2.6'
|
12
11
|
gem "byebug"
|
12
|
+
|
13
|
+
gem "hpricot", "~> 0.8.6"
|
14
|
+
gem 'net-http-persistent', github: 'drbrain/net-http-persistent'
|
15
|
+
gem "mechanize", "~> 2.7"
|
16
|
+
|
17
|
+
gem "webrick", "~> 1.7"
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,14 @@
|
|
1
|
+
GIT
|
2
|
+
remote: https://github.com/drbrain/net-http-persistent.git
|
3
|
+
revision: 857c3baaa541644fa437328b535042a500414119
|
4
|
+
specs:
|
5
|
+
net-http-persistent (4.0.1)
|
6
|
+
connection_pool (~> 2.2)
|
7
|
+
|
1
8
|
PATH
|
2
9
|
remote: .
|
3
10
|
specs:
|
4
|
-
url_common (0.1.1)
|
11
|
+
url_common (0.1.5)
|
5
12
|
fuzzyurl (~> 0.9.0)
|
6
13
|
mechanize (~> 2.6)
|
7
14
|
|
@@ -9,11 +16,12 @@ GEM
|
|
9
16
|
remote: https://rubygems.org/
|
10
17
|
specs:
|
11
18
|
byebug (11.1.3)
|
12
|
-
connection_pool (2.2.
|
19
|
+
connection_pool (2.2.5)
|
13
20
|
diff-lcs (1.4.4)
|
14
21
|
domain_name (0.5.20190701)
|
15
22
|
unf (>= 0.0.5, < 1.0.0)
|
16
23
|
fuzzyurl (0.9.0)
|
24
|
+
hpricot (0.8.6)
|
17
25
|
http-cookie (1.0.3)
|
18
26
|
domain_name (~> 0.5)
|
19
27
|
mechanize (2.7.6)
|
@@ -30,8 +38,6 @@ GEM
|
|
30
38
|
mime-types-data (3.2020.0512)
|
31
39
|
mini_portile2 (2.4.0)
|
32
40
|
net-http-digest_auth (1.4.1)
|
33
|
-
net-http-persistent (4.0.0)
|
34
|
-
connection_pool (~> 2.2)
|
35
41
|
nokogiri (1.10.10)
|
36
42
|
mini_portile2 (~> 2.4.0)
|
37
43
|
ntlm-http (0.1.1)
|
@@ -52,6 +58,7 @@ GEM
|
|
52
58
|
unf (0.1.4)
|
53
59
|
unf_ext
|
54
60
|
unf_ext (0.0.7.7)
|
61
|
+
webrick (1.7.0)
|
55
62
|
webrobots (0.1.2)
|
56
63
|
|
57
64
|
PLATFORMS
|
@@ -60,13 +67,16 @@ PLATFORMS
|
|
60
67
|
DEPENDENCIES
|
61
68
|
byebug
|
62
69
|
fuzzyurl (~> 0.9.0)
|
63
|
-
|
70
|
+
hpricot (~> 0.8.6)
|
71
|
+
mechanize (~> 2.7)
|
72
|
+
net-http-persistent!
|
64
73
|
rake (~> 12.0)
|
65
74
|
rspec (~> 3.0)
|
66
75
|
url_common!
|
76
|
+
webrick (~> 1.7)
|
67
77
|
|
68
78
|
RUBY VERSION
|
69
|
-
ruby
|
79
|
+
ruby 3.1.2p20
|
70
80
|
|
71
81
|
BUNDLED WITH
|
72
82
|
2.1.4
|
data/lib/url_common/version.rb
CHANGED
data/lib/url_common.rb
CHANGED
@@ -30,6 +30,17 @@ module UrlCommon
|
|
30
30
|
end
|
31
31
|
end
|
32
32
|
|
33
|
+
# UrlCommon.parse_fid_from_amazon_url("https://www.amazon.com/Original-GEN-2-0-Screwdriver-Industrial-Technician/dp/B0845919P2/?_encoding=UTF8&pd_rd_w=cekvo&content-id=amzn1.sym.bbb6bbd8-d236-47cb-b42f-734cb0cacc1f&pf_rd_p=bbb6bbd8-d236-47cb-b42f-734cb0cacc1f&pf_rd_r=3WP00V89EKYCQ1PB16VY&pd_rd_wg=HlQVt&pd_rd_r=30b33abe-2010-435e-b2cc-338f2ffbf3cf&ref_=pd_gw_ci_mcx_mi")
|
34
|
+
# Extracts the product id that follows "/dp/" in an Amazon URL.
#
# @param url [String, nil] the URL to inspect
# @return [String, nil] the run of letters/digits after "/dp/", or nil when
#   the URL has no such segment
def self.parse_fid_from_amazon_url(url)
  found = %r{/dp/([A-Za-z0-9]+)}.match(url)
  found ? found[1] : nil
end
|
42
|
+
|
43
|
+
|
33
44
|
def self.parse_country_from_itunes_url(url)
|
34
45
|
country = /https?:\/\/itunes\.apple\.com\/(..)\//.match(url)
|
35
46
|
if country
|
@@ -39,9 +50,22 @@ module UrlCommon
|
|
39
50
|
return 'us'
|
40
51
|
end
|
41
52
|
|
53
|
+
# original
|
54
|
+
# def self.get_base_domain(url)
|
55
|
+
# parts = URI.parse(url)
|
56
|
+
# return parts.host.gsub(/^www./,'')
|
57
|
+
# end
|
58
|
+
|
42
59
|
# Returns the host of a URL with a leading "www." removed.
#
# Literal spaces are escaped as %20 before parsing. If URI.parse rejects the
# URL (or yields no host), the more lenient Fuzzyurl parser is used instead.
#
# @param url [String] the URL to inspect
# @return [String] the host name without a leading "www."
def self.get_base_domain(url)
  url = url.gsub(' ', '%20')
  # The dot is escaped and the pattern anchored so that only a literal
  # "www." prefix is removed (a host such as "wwwhatever.org" is left intact).
  URI.parse(url).host.sub(/\Awww\./, '')
rescue StandardError
  Fuzzyurl.from_string(url).hostname.sub(/\Awww\./, '')
end
|
46
70
|
|
47
71
|
def self.join(base, rest, debug = false)
|
@@ -60,9 +84,19 @@ module UrlCommon
|
|
60
84
|
end
|
61
85
|
end
|
62
86
|
|
63
|
-
#TODO
|
64
87
|
# Counts the links contained in a piece of text.
#
# Input containing an "<html" tag is treated as HTML and every
# <a ...>...</a> element is counted. Anything else is treated as plain text
# and every whitespace-separated token containing "http://" or "https://"
# is counted once.
#
# @param html [String, nil] HTML or plain text
# @return [Integer] number of links found; 0 for nil or empty input
def self.count_links(html)
  return 0 if html.nil? || html.empty?

  if html.match?(/<html/i)
    # Scan the whole document: anchors contain spaces, so they cannot be
    # found by inspecting whitespace-separated tokens one at a time.
    html.scan(/<a\s[^>]*>.*?<\/a>/im).size
  else
    html.split.count { |token| token.match?(%r{https?://}) }
  end
end
|
67
101
|
|
68
102
|
def self.agent
|
@@ -82,13 +116,23 @@ module UrlCommon
|
|
82
116
|
#
|
83
117
|
# Reduces a URL to "domain + path + query", capped at 255 characters.
#
# @param url [String] the URL to reduce; literal spaces are escaped as %20
#   before parsing
# @param base_domain [String, nil] host to prefix the result with; when nil
#   it is derived with UrlCommon.get_base_domain(url)
# @return [String] "#{base_domain}#{path}", followed by "?query" when the URL
#   has a query string, truncated to its first 255 characters
def self.url_base(url, base_domain = nil)
  base_domain = UrlCommon.get_base_domain(url) if base_domain.nil?

  parts =
    begin
      url = url.gsub(' ', '%20')
      URI.parse(url)
    rescue StandardError
      # URI.parse is strict; fall back to Fuzzyurl, which exposes the same
      # #path and #query readers. A caller-supplied base_domain is kept as is.
      Fuzzyurl.from_string(url)
    end

  query = parts.query ? "?#{parts.query}" : ''
  "#{base_domain}#{parts.path}#{query}"[0..254]
end
|
93
137
|
|
94
138
|
#tested #https://www.amazon.com/gp/product/B01DT4A2R4/ref=as_li_qf_sp_asin_il_tl?ie=UTF8&tag=nickjanetakis-20&camp=1789&creative=9325&linkCode=as2&creativeASIN=B01DT4A2R4&linkId=496be5e222b6291369c0a393c797c2c0
|
@@ -262,8 +306,9 @@ module UrlCommon
|
|
262
306
|
#TODO needs tests
|
263
307
|
# Wraps already-fetched HTML in a Mechanize::Page whose uri is set to the
# given URL (with literal spaces escaped as %20).
#
# @param url [String] the URL the HTML came from
# @param html [String] the page body
# @return [Mechanize::Page] the constructed page
def self.create_mechanize_page_from_html(url, html)
  headers = {'content-type'=>'text/html'}
  Mechanize::Page.new(nil, headers, html, nil, Mechanize.new).tap do |page|
    page.uri = URI.parse(url.gsub(' ', '%20'))
  end
end
|
269
314
|
|
@@ -279,6 +324,7 @@ module UrlCommon
|
|
279
324
|
end
|
280
325
|
|
281
326
|
#TODO needs tests
|
327
|
+
# UrlCommon.get_page_title("https://gist.github.com/fuzzygroup/811a9334b1a6dc394de74a23cb7e12fa")
|
282
328
|
def self.get_page_title(url, html)
|
283
329
|
page = UrlCommon.create_mechanize_page_from_html(url, html)
|
284
330
|
title = ""
|
@@ -496,5 +542,26 @@ module UrlCommon
|
|
496
542
|
|
497
543
|
return UrlCommon.select_best_rssurl_from_rssurls(results)
|
498
544
|
end
|
545
|
+
|
546
|
+
#based on this approach: https://medium.com/@sparkboldstudio/building-a-url-shortener-rails-app-96db60d3bf9d
|
547
|
+
#def self.generate_short_fid
|
548
|
+
#def self.create_fid
|
549
|
+
# Generates a random base-36 identifier.
#
# @return [String] a random integer below 36**8 rendered in base 36, i.e.
#   between 1 and 8 characters drawn from 0-9 and a-z
def self.generate_fid
  id_space = 36**8
  rand(id_space).to_s(36)
end
|
552
|
+
|
553
|
+
# Normalizes a long URL and appends it to a base URL.
#
# The URL is stripped of surrounding whitespace, downcased, and every
# "http://", "https://" and "www." occurrence is removed. The caller's
# string is not modified.
#
# @param long_url [String] the URL to normalize
# @param url_base [String] prefix that the normalized URL is appended to
# @return [String] url_base followed by the normalized URL
def self.sanitize(long_url, url_base)
  # Use the non-mutating strip so frozen strings work and the argument is
  # left untouched.
  sanitized = long_url.strip.downcase.gsub(/(https?:\/\/)|(www\.)/, "")
  url_base + sanitized
end
|
559
|
+
|
560
|
+
# todo expand on this by checking to make sure it doesn't call again
|
561
|
+
# should be fine but ttfn
|
562
|
+
# Work-in-progress permalink builder.
#
# Currently generates a short id (not yet used) and returns the sanitized
# form of the URL under the pullquotes.io base.
#
# @param tbl [Object] currently unused
# @param fld [Object] currently unused
# @param url [String] the URL to turn into a permalink
# @return [String] the result of UrlCommon.sanitize for the URL
def self.create_permalink(tbl, fld, url)
  # UrlCommon.generate_fid is the id generator this module defines.
  # TODO: short_url is not used yet; also check the id is not already taken.
  short_url = UrlCommon.generate_fid
  UrlCommon.sanitize(url, "https://pullquotes.io/pull_quotes/")
end
|
499
566
|
|
500
567
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_common
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Scott Johnson
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-07-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: fuzzyurl
|
@@ -84,7 +84,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
84
84
|
- !ruby/object:Gem::Version
|
85
85
|
version: '0'
|
86
86
|
requirements: []
|
87
|
-
rubygems_version: 3.
|
87
|
+
rubygems_version: 3.3.7
|
88
88
|
signing_key:
|
89
89
|
specification_version: 4
|
90
90
|
summary: This is a class library designed for common url manipulation and crawling
|