url_common 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e75ba5e9d78301e4cf753f4ade0a54b036aaca17d7527791b984d8bc81215a5a
4
- data.tar.gz: 7bae7598dd4884c9c795e511c32c02cca9095a32f23522e0241842631250488b
3
+ metadata.gz: b6ae6f1bbc3ec4b834e05a97701a13746db44ad172869868c962067599c12cc2
4
+ data.tar.gz: 3194ad31952c2bfc4fc5730ad766104f96a0299b3000a99a6efd32224907adfc
5
5
  SHA512:
6
- metadata.gz: d633158a6b206eca935948a72ebcce4bf90a196c373eca02ae9895819f5b027333b60e4dc5ec84f7b6507d4f468ac1752d36ca367abae4b03b65fd6665857095
7
- data.tar.gz: a48dc2eace4656f5b2e16957174ae399633d97ddcff830d799ce31dee94ebfcbe648a4e3a1eb78c0ea8a91f63d8df22df783eed7f99d21920a2ea6a3b6edef44
6
+ metadata.gz: 32882ac9c6ee8e890daa00cd7ebedb631b99ad3b57856378a49675586a4b125bef12dd4ab116087684969c600acd63300f9689af8ee223792362aecbbbb282fd
7
+ data.tar.gz: a01f7d306fbc08f3610ea0732540d0ee80c13703d36fb5d4c3919025df9ccca92707737dfc640016086e7a179437108ddb696144bc55faa789ed6c46aa7f1184
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- url_common (0.1.1)
4
+ url_common (0.1.2)
5
5
  fuzzyurl (~> 0.9.0)
6
6
  mechanize (~> 2.6)
7
7
 
@@ -1,3 +1,3 @@
1
1
  module UrlCommon
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.2"
3
3
  end
data/lib/url_common.rb CHANGED
@@ -39,9 +39,22 @@ module UrlCommon
39
39
  return 'us'
40
40
  end
41
41
 
42
+ # original
43
+ # def self.get_base_domain(url)
44
+ # parts = URI.parse(url)
45
+ # return parts.host.gsub(/^www./,'')
46
+ # end
47
+
42
48
  def self.get_base_domain(url)
43
- parts = URI.parse(url)
44
- return parts.host.gsub(/^www./,'')
49
+ #debugger if url =~ /c06rh22whx1g/
50
+ begin
51
+ url = url.gsub(/ /,'%20')
52
+ parts = URI.parse(url)
53
+ return parts.host.gsub(/^www./,'')
54
+ rescue StandardError => e
55
+ fu = Fuzzyurl.from_string(url)
56
+ return fu.hostname.gsub(/^www./,'')
57
+ end
45
58
  end
46
59
 
47
60
  def self.join(base, rest, debug = false)
@@ -60,9 +73,19 @@ module UrlCommon
60
73
  end
61
74
  end
62
75
 
63
- #TODO
64
76
  def self.count_links(html)
65
- return 0
77
+ if html =~ /<html/i
78
+ content_type = "html"
79
+ else
80
+ content_type = "ascii"
81
+ end
82
+ parts = html.split(" ")
83
+ link_ctr = 0
84
+ parts.each do |part|
85
+ link_ctr = link_ctr + 1 if part =~ /https:?\/\// && content_type == 'ascii'
86
+ link_ctr = link_ctr + 1 if part =~ /<a [^>]+.+<\/a>/i && content_type == 'html'
87
+ end
88
+ link_ctr
66
89
  end
67
90
 
68
91
  def self.agent
@@ -82,13 +105,23 @@ module UrlCommon
82
105
  #
83
106
  def self.url_base(url, base_domain=nil)
84
107
  if base_domain.nil?
85
- base_domain = get_base_domain(url)
108
+ base_domain = UrlCommon.get_base_domain(url)
109
+ end
110
+ begin
111
+ url = url.gsub(/ /,'%20')
112
+ parts = URI.parse(url)
113
+ extra = ""
114
+ extra = "?#{parts.query}" if parts.query
115
+ url_base = "#{base_domain}#{parts.path}#{extra}"
116
+ return url_base[0..254]
117
+ rescue StandardError => e
118
+ fu = Fuzzyurl.from_string(url)
119
+ base_domain = UrlCommon.get_base_domain(url)
120
+ extra = ""
121
+ extra = "?#{fu.query}" if fu.query
122
+ url_base = "#{base_domain}#{fu.path}#{extra}"
123
+ return url_base[0..254]
86
124
  end
87
- parts = URI.parse(url)
88
- extra = ""
89
- extra = "?#{parts.query}" if parts.query
90
- url_base = "#{base_domain}#{parts.path}#{extra}"
91
- return url_base[0..254]
92
125
  end
93
126
 
94
127
  #tested #https://www.amazon.com/gp/product/B01DT4A2R4/ref=as_li_qf_sp_asin_il_tl?ie=UTF8&tag=nickjanetakis-20&camp=1789&creative=9325&linkCode=as2&creativeASIN=B01DT4A2R4&linkId=496be5e222b6291369c0a393c797c2c0
@@ -262,8 +295,9 @@ module UrlCommon
262
295
  #TODO needs tests
263
296
  def self.create_mechanize_page_from_html(url, html)
264
297
  mechanize_page = Mechanize::Page.new(nil, {'content-type'=>'text/html'}, html, nil, Mechanize.new)
298
+ url = url.gsub(/ /,'%20')
265
299
  mechanize_page.uri = URI.parse(url)
266
-
300
+
267
301
  return mechanize_page
268
302
  end
269
303
 
@@ -279,6 +313,7 @@ module UrlCommon
279
313
  end
280
314
 
281
315
  #TODO needs tests
316
+ # UrlCommon.get_page_title("https://gist.github.com/fuzzygroup/811a9334b1a6dc394de74a23cb7e12fa")
282
317
  def self.get_page_title(url, html)
283
318
  page = UrlCommon.create_mechanize_page_from_html(url, html)
284
319
  title = ""
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: url_common
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Scott Johnson
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-06-04 00:00:00.000000000 Z
11
+ date: 2022-06-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: fuzzyurl