url_common 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/url_common/version.rb +1 -1
- data/lib/url_common.rb +46 -11
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b6ae6f1bbc3ec4b834e05a97701a13746db44ad172869868c962067599c12cc2
|
4
|
+
data.tar.gz: 3194ad31952c2bfc4fc5730ad766104f96a0299b3000a99a6efd32224907adfc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 32882ac9c6ee8e890daa00cd7ebedb631b99ad3b57856378a49675586a4b125bef12dd4ab116087684969c600acd63300f9689af8ee223792362aecbbbb282fd
|
7
|
+
data.tar.gz: a01f7d306fbc08f3610ea0732540d0ee80c13703d36fb5d4c3919025df9ccca92707737dfc640016086e7a179437108ddb696144bc55faa789ed6c46aa7f1184
|
data/Gemfile.lock
CHANGED
data/lib/url_common/version.rb
CHANGED
data/lib/url_common.rb
CHANGED
@@ -39,9 +39,22 @@ module UrlCommon
|
|
39
39
|
return 'us'
|
40
40
|
end
|
41
41
|
|
42
|
+
# original
|
43
|
+
# def self.get_base_domain(url)
|
44
|
+
# parts = URI.parse(url)
|
45
|
+
# return parts.host.gsub(/^www./,'')
|
46
|
+
# end
|
47
|
+
|
42
48
|
def self.get_base_domain(url)
|
43
|
-
|
44
|
-
|
49
|
+
#debugger if url =~ /c06rh22whx1g/
|
50
|
+
begin
|
51
|
+
url = url.gsub(/ /,'%20')
|
52
|
+
parts = URI.parse(url)
|
53
|
+
return parts.host.gsub(/^www./,'')
|
54
|
+
rescue StandardError => e
|
55
|
+
fu = Fuzzyurl.from_string(url)
|
56
|
+
return fu.hostname.gsub(/^www./,'')
|
57
|
+
end
|
45
58
|
end
|
46
59
|
|
47
60
|
def self.join(base, rest, debug = false)
|
@@ -60,9 +73,19 @@ module UrlCommon
|
|
60
73
|
end
|
61
74
|
end
|
62
75
|
|
63
|
-
#TODO
|
64
76
|
def self.count_links(html)
|
65
|
-
|
77
|
+
if html =~ /<html/i
|
78
|
+
content_type = "html"
|
79
|
+
else
|
80
|
+
content_type = "ascii"
|
81
|
+
end
|
82
|
+
parts = html.split(" ")
|
83
|
+
link_ctr = 0
|
84
|
+
parts.each do |part|
|
85
|
+
link_ctr = link_ctr + 1 if part =~ /https:?\/\// && content_type == 'ascii'
|
86
|
+
link_ctr = link_ctr + 1 if part =~ /<a [^>]+.+<\/a>/i && content_type == 'html'
|
87
|
+
end
|
88
|
+
link_ctr
|
66
89
|
end
|
67
90
|
|
68
91
|
def self.agent
|
@@ -82,13 +105,23 @@ module UrlCommon
|
|
82
105
|
#
|
83
106
|
def self.url_base(url, base_domain=nil)
|
84
107
|
if base_domain.nil?
|
85
|
-
base_domain = get_base_domain(url)
|
108
|
+
base_domain = UrlCommon.get_base_domain(url)
|
109
|
+
end
|
110
|
+
begin
|
111
|
+
url = url.gsub(/ /,'%20')
|
112
|
+
parts = URI.parse(url)
|
113
|
+
extra = ""
|
114
|
+
extra = "?#{parts.query}" if parts.query
|
115
|
+
url_base = "#{base_domain}#{parts.path}#{extra}"
|
116
|
+
return url_base[0..254]
|
117
|
+
rescue StandardError => e
|
118
|
+
fu = Fuzzyurl.from_string(url)
|
119
|
+
base_domain = UrlCommon.get_base_domain(url)
|
120
|
+
extra = ""
|
121
|
+
extra = "?#{fu.query}" if fu.query
|
122
|
+
url_base = "#{base_domain}#{fu.path}#{extra}"
|
123
|
+
return url_base[0..254]
|
86
124
|
end
|
87
|
-
parts = URI.parse(url)
|
88
|
-
extra = ""
|
89
|
-
extra = "?#{parts.query}" if parts.query
|
90
|
-
url_base = "#{base_domain}#{parts.path}#{extra}"
|
91
|
-
return url_base[0..254]
|
92
125
|
end
|
93
126
|
|
94
127
|
#tested #https://www.amazon.com/gp/product/B01DT4A2R4/ref=as_li_qf_sp_asin_il_tl?ie=UTF8&tag=nickjanetakis-20&camp=1789&creative=9325&linkCode=as2&creativeASIN=B01DT4A2R4&linkId=496be5e222b6291369c0a393c797c2c0
|
@@ -262,8 +295,9 @@ module UrlCommon
|
|
262
295
|
#TODO needs tests
|
263
296
|
def self.create_mechanize_page_from_html(url, html)
|
264
297
|
mechanize_page = Mechanize::Page.new(nil, {'content-type'=>'text/html'}, html, nil, Mechanize.new)
|
298
|
+
url = url.gsub(/ /,'%20')
|
265
299
|
mechanize_page.uri = URI.parse(url)
|
266
|
-
|
300
|
+
|
267
301
|
return mechanize_page
|
268
302
|
end
|
269
303
|
|
@@ -279,6 +313,7 @@ module UrlCommon
|
|
279
313
|
end
|
280
314
|
|
281
315
|
#TODO needs tests
|
316
|
+
# UrlCommon.get_page_title("https://gist.github.com/fuzzygroup/811a9334b1a6dc394de74a23cb7e12fa", html)
|
282
317
|
def self.get_page_title(url, html)
|
283
318
|
page = UrlCommon.create_mechanize_page_from_html(url, html)
|
284
319
|
title = ""
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_common
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Scott Johnson
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-06-
|
11
|
+
date: 2022-06-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: fuzzyurl
|