url_common 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e75ba5e9d78301e4cf753f4ade0a54b036aaca17d7527791b984d8bc81215a5a
4
- data.tar.gz: 7bae7598dd4884c9c795e511c32c02cca9095a32f23522e0241842631250488b
3
+ metadata.gz: b6ae6f1bbc3ec4b834e05a97701a13746db44ad172869868c962067599c12cc2
4
+ data.tar.gz: 3194ad31952c2bfc4fc5730ad766104f96a0299b3000a99a6efd32224907adfc
5
5
  SHA512:
6
- metadata.gz: d633158a6b206eca935948a72ebcce4bf90a196c373eca02ae9895819f5b027333b60e4dc5ec84f7b6507d4f468ac1752d36ca367abae4b03b65fd6665857095
7
- data.tar.gz: a48dc2eace4656f5b2e16957174ae399633d97ddcff830d799ce31dee94ebfcbe648a4e3a1eb78c0ea8a91f63d8df22df783eed7f99d21920a2ea6a3b6edef44
6
+ metadata.gz: 32882ac9c6ee8e890daa00cd7ebedb631b99ad3b57856378a49675586a4b125bef12dd4ab116087684969c600acd63300f9689af8ee223792362aecbbbb282fd
7
+ data.tar.gz: a01f7d306fbc08f3610ea0732540d0ee80c13703d36fb5d4c3919025df9ccca92707737dfc640016086e7a179437108ddb696144bc55faa789ed6c46aa7f1184
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- url_common (0.1.1)
4
+ url_common (0.1.2)
5
5
  fuzzyurl (~> 0.9.0)
6
6
  mechanize (~> 2.6)
7
7
 
@@ -1,3 +1,3 @@
1
1
  module UrlCommon
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.2"
3
3
  end
data/lib/url_common.rb CHANGED
@@ -39,9 +39,22 @@ module UrlCommon
39
39
  return 'us'
40
40
  end
41
41
 
42
+ # original
43
+ # def self.get_base_domain(url)
44
+ # parts = URI.parse(url)
45
+ # return parts.host.gsub(/^www./,'')
46
+ # end
47
+
42
48
  def self.get_base_domain(url)
43
- parts = URI.parse(url)
44
- return parts.host.gsub(/^www./,'')
49
+ #debugger if url =~ /c06rh22whx1g/
50
+ begin
51
+ url = url.gsub(/ /,'%20')
52
+ parts = URI.parse(url)
53
+ return parts.host.gsub(/^www./,'')
54
+ rescue StandardError => e
55
+ fu = Fuzzyurl.from_string(url)
56
+ return fu.hostname.gsub(/^www./,'')
57
+ end
45
58
  end
46
59
 
47
60
  def self.join(base, rest, debug = false)
@@ -60,9 +73,19 @@ module UrlCommon
60
73
  end
61
74
  end
62
75
 
63
- #TODO
64
76
  def self.count_links(html)
65
- return 0
77
+ if html =~ /<html/i
78
+ content_type = "html"
79
+ else
80
+ content_type = "ascii"
81
+ end
82
+ parts = html.split(" ")
83
+ link_ctr = 0
84
+ parts.each do |part|
85
+ link_ctr = link_ctr + 1 if part =~ /https:?\/\// && content_type == 'ascii'
86
+ link_ctr = link_ctr + 1 if part =~ /<a [^>]+.+<\/a>/i && content_type == 'html'
87
+ end
88
+ link_ctr
66
89
  end
67
90
 
68
91
  def self.agent
@@ -82,13 +105,23 @@ module UrlCommon
82
105
  #
83
106
  def self.url_base(url, base_domain=nil)
84
107
  if base_domain.nil?
85
- base_domain = get_base_domain(url)
108
+ base_domain = UrlCommon.get_base_domain(url)
109
+ end
110
+ begin
111
+ url = url.gsub(/ /,'%20')
112
+ parts = URI.parse(url)
113
+ extra = ""
114
+ extra = "?#{parts.query}" if parts.query
115
+ url_base = "#{base_domain}#{parts.path}#{extra}"
116
+ return url_base[0..254]
117
+ rescue StandardError => e
118
+ fu = Fuzzyurl.from_string(url)
119
+ base_domain = UrlCommon.get_base_domain(url)
120
+ extra = ""
121
+ extra = "?#{fu.query}" if fu.query
122
+ url_base = "#{base_domain}#{fu.path}#{extra}"
123
+ return url_base[0..254]
86
124
  end
87
- parts = URI.parse(url)
88
- extra = ""
89
- extra = "?#{parts.query}" if parts.query
90
- url_base = "#{base_domain}#{parts.path}#{extra}"
91
- return url_base[0..254]
92
125
  end
93
126
 
94
127
  #tested #https://www.amazon.com/gp/product/B01DT4A2R4/ref=as_li_qf_sp_asin_il_tl?ie=UTF8&tag=nickjanetakis-20&camp=1789&creative=9325&linkCode=as2&creativeASIN=B01DT4A2R4&linkId=496be5e222b6291369c0a393c797c2c0
@@ -262,8 +295,9 @@ module UrlCommon
262
295
  #TODO needs tests
263
296
  def self.create_mechanize_page_from_html(url, html)
264
297
  mechanize_page = Mechanize::Page.new(nil, {'content-type'=>'text/html'}, html, nil, Mechanize.new)
298
+ url = url.gsub(/ /,'%20')
265
299
  mechanize_page.uri = URI.parse(url)
266
-
300
+
267
301
  return mechanize_page
268
302
  end
269
303
 
@@ -279,6 +313,7 @@ module UrlCommon
279
313
  end
280
314
 
281
315
  #TODO needs tests
316
+ # UrlCommon.get_page_title("https://gist.github.com/fuzzygroup/811a9334b1a6dc394de74a23cb7e12fa")
282
317
  def self.get_page_title(url, html)
283
318
  page = UrlCommon.create_mechanize_page_from_html(url, html)
284
319
  title = ""
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: url_common
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Scott Johnson
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-06-04 00:00:00.000000000 Z
11
+ date: 2022-06-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: fuzzyurl