url_common 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/url_common/version.rb +1 -1
- data/lib/url_common.rb +46 -11
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b6ae6f1bbc3ec4b834e05a97701a13746db44ad172869868c962067599c12cc2
|
4
|
+
data.tar.gz: 3194ad31952c2bfc4fc5730ad766104f96a0299b3000a99a6efd32224907adfc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 32882ac9c6ee8e890daa00cd7ebedb631b99ad3b57856378a49675586a4b125bef12dd4ab116087684969c600acd63300f9689af8ee223792362aecbbbb282fd
|
7
|
+
data.tar.gz: a01f7d306fbc08f3610ea0732540d0ee80c13703d36fb5d4c3919025df9ccca92707737dfc640016086e7a179437108ddb696144bc55faa789ed6c46aa7f1184
|
data/Gemfile.lock
CHANGED
data/lib/url_common/version.rb
CHANGED
data/lib/url_common.rb
CHANGED
@@ -39,9 +39,22 @@ module UrlCommon
|
|
39
39
|
return 'us'
|
40
40
|
end
|
41
41
|
|
42
|
+
# original
|
43
|
+
# def self.get_base_domain(url)
|
44
|
+
# parts = URI.parse(url)
|
45
|
+
# return parts.host.gsub(/^www./,'')
|
46
|
+
# end
|
47
|
+
|
42
48
|
def self.get_base_domain(url)
|
43
|
-
|
44
|
-
|
49
|
+
#debugger if url =~ /c06rh22whx1g/
|
50
|
+
begin
|
51
|
+
url = url.gsub(/ /,'%20')
|
52
|
+
parts = URI.parse(url)
|
53
|
+
return parts.host.gsub(/^www./,'')
|
54
|
+
rescue StandardError => e
|
55
|
+
fu = Fuzzyurl.from_string(url)
|
56
|
+
return fu.hostname.gsub(/^www./,'')
|
57
|
+
end
|
45
58
|
end
|
46
59
|
|
47
60
|
def self.join(base, rest, debug = false)
|
@@ -60,9 +73,19 @@ module UrlCommon
|
|
60
73
|
end
|
61
74
|
end
|
62
75
|
|
63
|
-
#TODO
|
64
76
|
def self.count_links(html)
|
65
|
-
|
77
|
+
if html =~ /<html/i
|
78
|
+
content_type = "html"
|
79
|
+
else
|
80
|
+
content_type = "ascii"
|
81
|
+
end
|
82
|
+
parts = html.split(" ")
|
83
|
+
link_ctr = 0
|
84
|
+
parts.each do |part|
|
85
|
+
link_ctr = link_ctr + 1 if part =~ /https:?\/\// && content_type == 'ascii'
|
86
|
+
link_ctr = link_ctr + 1 if part =~ /<a [^>]+.+<\/a>/i && content_type == 'html'
|
87
|
+
end
|
88
|
+
link_ctr
|
66
89
|
end
|
67
90
|
|
68
91
|
def self.agent
|
@@ -82,13 +105,23 @@ module UrlCommon
|
|
82
105
|
#
|
83
106
|
def self.url_base(url, base_domain=nil)
|
84
107
|
if base_domain.nil?
|
85
|
-
base_domain = get_base_domain(url)
|
108
|
+
base_domain = UrlCommon.get_base_domain(url)
|
109
|
+
end
|
110
|
+
begin
|
111
|
+
url = url.gsub(/ /,'%20')
|
112
|
+
parts = URI.parse(url)
|
113
|
+
extra = ""
|
114
|
+
extra = "?#{parts.query}" if parts.query
|
115
|
+
url_base = "#{base_domain}#{parts.path}#{extra}"
|
116
|
+
return url_base[0..254]
|
117
|
+
rescue StandardError => e
|
118
|
+
fu = Fuzzyurl.from_string(url)
|
119
|
+
base_domain = UrlCommon.get_base_domain(url)
|
120
|
+
extra = ""
|
121
|
+
extra = "?#{fu.query}" if fu.query
|
122
|
+
url_base = "#{base_domain}#{fu.path}#{extra}"
|
123
|
+
return url_base[0..254]
|
86
124
|
end
|
87
|
-
parts = URI.parse(url)
|
88
|
-
extra = ""
|
89
|
-
extra = "?#{parts.query}" if parts.query
|
90
|
-
url_base = "#{base_domain}#{parts.path}#{extra}"
|
91
|
-
return url_base[0..254]
|
92
125
|
end
|
93
126
|
|
94
127
|
#tested #https://www.amazon.com/gp/product/B01DT4A2R4/ref=as_li_qf_sp_asin_il_tl?ie=UTF8&tag=nickjanetakis-20&camp=1789&creative=9325&linkCode=as2&creativeASIN=B01DT4A2R4&linkId=496be5e222b6291369c0a393c797c2c0
|
@@ -262,8 +295,9 @@ module UrlCommon
|
|
262
295
|
#TODO needs tests
|
263
296
|
def self.create_mechanize_page_from_html(url, html)
|
264
297
|
mechanize_page = Mechanize::Page.new(nil, {'content-type'=>'text/html'}, html, nil, Mechanize.new)
|
298
|
+
url = url.gsub(/ /,'%20')
|
265
299
|
mechanize_page.uri = URI.parse(url)
|
266
|
-
|
300
|
+
|
267
301
|
return mechanize_page
|
268
302
|
end
|
269
303
|
|
@@ -279,6 +313,7 @@ module UrlCommon
|
|
279
313
|
end
|
280
314
|
|
281
315
|
#TODO needs tests
|
316
|
+
# UrlCommon.get_page_title("https://gist.github.com/fuzzygroup/811a9334b1a6dc394de74a23cb7e12fa")
|
282
317
|
def self.get_page_title(url, html)
|
283
318
|
page = UrlCommon.create_mechanize_page_from_html(url, html)
|
284
319
|
title = ""
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_common
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Scott Johnson
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-06-
|
11
|
+
date: 2022-06-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: fuzzyurl
|