url_common 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/Gemfile.lock +72 -0
- data/README.md +4 -2
- data/lib/url_common/version.rb +1 -1
- data/lib/url_common.rb +231 -0
- data/url_common.gemspec +4 -0
- metadata +32 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e75ba5e9d78301e4cf753f4ade0a54b036aaca17d7527791b984d8bc81215a5a
|
4
|
+
data.tar.gz: 7bae7598dd4884c9c795e511c32c02cca9095a32f23522e0241842631250488b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d633158a6b206eca935948a72ebcce4bf90a196c373eca02ae9895819f5b027333b60e4dc5ec84f7b6507d4f468ac1752d36ca367abae4b03b65fd6665857095
|
7
|
+
data.tar.gz: a48dc2eace4656f5b2e16957174ae399633d97ddcff830d799ce31dee94ebfcbe648a4e3a1eb78c0ea8a91f63d8df22df783eed7f99d21920a2ea6a3b6edef44
|
data/Gemfile
CHANGED
data/Gemfile.lock
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
url_common (0.1.1)
|
5
|
+
fuzzyurl (~> 0.9.0)
|
6
|
+
mechanize (~> 2.6)
|
7
|
+
|
8
|
+
GEM
|
9
|
+
remote: https://rubygems.org/
|
10
|
+
specs:
|
11
|
+
byebug (11.1.3)
|
12
|
+
connection_pool (2.2.3)
|
13
|
+
diff-lcs (1.4.4)
|
14
|
+
domain_name (0.5.20190701)
|
15
|
+
unf (>= 0.0.5, < 1.0.0)
|
16
|
+
fuzzyurl (0.9.0)
|
17
|
+
http-cookie (1.0.3)
|
18
|
+
domain_name (~> 0.5)
|
19
|
+
mechanize (2.7.6)
|
20
|
+
domain_name (~> 0.5, >= 0.5.1)
|
21
|
+
http-cookie (~> 1.0)
|
22
|
+
mime-types (>= 1.17.2)
|
23
|
+
net-http-digest_auth (~> 1.1, >= 1.1.1)
|
24
|
+
net-http-persistent (>= 2.5.2)
|
25
|
+
nokogiri (~> 1.6)
|
26
|
+
ntlm-http (~> 0.1, >= 0.1.1)
|
27
|
+
webrobots (>= 0.0.9, < 0.2)
|
28
|
+
mime-types (3.3.1)
|
29
|
+
mime-types-data (~> 3.2015)
|
30
|
+
mime-types-data (3.2020.0512)
|
31
|
+
mini_portile2 (2.4.0)
|
32
|
+
net-http-digest_auth (1.4.1)
|
33
|
+
net-http-persistent (4.0.0)
|
34
|
+
connection_pool (~> 2.2)
|
35
|
+
nokogiri (1.10.10)
|
36
|
+
mini_portile2 (~> 2.4.0)
|
37
|
+
ntlm-http (0.1.1)
|
38
|
+
rake (12.3.3)
|
39
|
+
rspec (3.9.0)
|
40
|
+
rspec-core (~> 3.9.0)
|
41
|
+
rspec-expectations (~> 3.9.0)
|
42
|
+
rspec-mocks (~> 3.9.0)
|
43
|
+
rspec-core (3.9.2)
|
44
|
+
rspec-support (~> 3.9.3)
|
45
|
+
rspec-expectations (3.9.2)
|
46
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
47
|
+
rspec-support (~> 3.9.0)
|
48
|
+
rspec-mocks (3.9.1)
|
49
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
50
|
+
rspec-support (~> 3.9.0)
|
51
|
+
rspec-support (3.9.3)
|
52
|
+
unf (0.1.4)
|
53
|
+
unf_ext
|
54
|
+
unf_ext (0.0.7.7)
|
55
|
+
webrobots (0.1.2)
|
56
|
+
|
57
|
+
PLATFORMS
|
58
|
+
ruby
|
59
|
+
|
60
|
+
DEPENDENCIES
|
61
|
+
byebug
|
62
|
+
fuzzyurl (~> 0.9.0)
|
63
|
+
mechanize (~> 2.6)
|
64
|
+
rake (~> 12.0)
|
65
|
+
rspec (~> 3.0)
|
66
|
+
url_common!
|
67
|
+
|
68
|
+
RUBY VERSION
|
69
|
+
ruby 2.7.1p83
|
70
|
+
|
71
|
+
BUNDLED WITH
|
72
|
+
2.1.4
|
data/README.md
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
# UrlCommon
|
2
2
|
|
3
|
-
|
3
|
+
This is a gem for performing common URL-centric things. I wrote this years ago and have always just moved it from project to project leading to a huge number of different versions on my development system. Finally I'm creating a gem out of it to aid in its use across projects.
|
4
4
|
|
5
|
-
|
5
|
+
I don't claim that these are great, perfect, etc. I claim that they are workman like tools which I FIND USEFUL and I want to use them more easily across multiple projects hence the open sourcing of them.
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
@@ -24,6 +24,8 @@ Or install it yourself as:
|
|
24
24
|
|
25
25
|
TODO: Write usage instructions here
|
26
26
|
|
27
|
+
This is a todo.
|
28
|
+
|
27
29
|
## Development
|
28
30
|
|
29
31
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/lib/url_common/version.rb
CHANGED
data/lib/url_common.rb
CHANGED
@@ -266,4 +266,235 @@ module UrlCommon
|
|
266
266
|
|
267
267
|
return mechanize_page
|
268
268
|
end
|
269
|
+
|
270
|
+
#TODO needs tests
|
271
|
+
def self.get_meta_description(url, html)
|
272
|
+
page = UrlCommon.create_mechanize_page_from_html(url, html)
|
273
|
+
description = ""
|
274
|
+
begin
|
275
|
+
description = page.parser.at("meta[name='description']")['content']
|
276
|
+
rescue StandardError => e
|
277
|
+
end
|
278
|
+
return description
|
279
|
+
end
|
280
|
+
|
281
|
+
#TODO needs tests
|
282
|
+
def self.get_page_title(url, html)
|
283
|
+
page = UrlCommon.create_mechanize_page_from_html(url, html)
|
284
|
+
title = ""
|
285
|
+
begin
|
286
|
+
title = page.parser.css('title').first.content
|
287
|
+
rescue StandardError => e
|
288
|
+
end
|
289
|
+
return title
|
290
|
+
end
|
291
|
+
|
292
|
+
def self.extract_links_from_text(text)
|
293
|
+
agent = Mechanize.new
|
294
|
+
html = "<HTML><BODY>#{text}</BODY></HTML>"
|
295
|
+
page = Mechanize::Page.new(nil,{'content-type'=>'text/html'},html,nil,agent)
|
296
|
+
return page.links
|
297
|
+
end
|
298
|
+
|
299
|
+
# https://docs.aylien.com/textapi/#using-the-api
|
300
|
+
def self.summarize_url(url)
|
301
|
+
#GET /summarize?url=http://www.bbc.com/sport/0/football/25912393
|
302
|
+
agent = Mechanize.new
|
303
|
+
summarization_url = ""
|
304
|
+
page = agent.get(url)
|
305
|
+
end
|
306
|
+
|
307
|
+
# fucking idiotic test case for this fucking idiot is: https://devslopes.com/
|
308
|
+
def self.test_random_url(url_or_host)
|
309
|
+
random_filename = TextCommon.sha(Time.now.to_s) + ".xml"
|
310
|
+
if url_or_host =~ /http/
|
311
|
+
url = File.join(url_or_host, random_filename)
|
312
|
+
else
|
313
|
+
url = File.join("http://", host, random_filename)
|
314
|
+
end
|
315
|
+
status, url = UrlCommon.check_for_404(url, true)
|
316
|
+
#
|
317
|
+
# Key bit of logic -- if we get a return value for a randomized sha then that means that
|
318
|
+
# a) the destination site owner is a fucking moron
|
319
|
+
# b) that the destination site owner has set his site so it NEVER returns a 404
|
320
|
+
# c) they're a fucking moron
|
321
|
+
# d) if I get a 200 back then it means that they return you to the home page for anything and NOT
|
322
|
+
# a proper 404 so need to flip flop the logic and return error on a 200; sheesh
|
323
|
+
#
|
324
|
+
return :error, url if status == :ok
|
325
|
+
return :ok, url
|
326
|
+
end
|
327
|
+
|
328
|
+
def self.select_best_rssurl_from_rssurls(urls)
|
329
|
+
return urls.sort_by(&:length).first
|
330
|
+
end
|
331
|
+
|
332
|
+
def self.possible_rssurls(site_url, skip_slash_blog = false)
|
333
|
+
# urls we will probe
|
334
|
+
possible_rssurl_formats = []
|
335
|
+
|
336
|
+
# normal baselines
|
337
|
+
possible_rssurl_formats << "feed.xml"
|
338
|
+
possible_rssurl_formats << "rss.xml"
|
339
|
+
possible_rssurl_formats << "atom.xml"
|
340
|
+
possible_rssurl_formats << "feed/"
|
341
|
+
|
342
|
+
# optionally look at /blog/
|
343
|
+
possible_rssurl_formats << "/blog/feed.xml"
|
344
|
+
possible_rssurl_formats << "/blog/rss.xml"
|
345
|
+
possible_rssurl_formats << "/blog/atom.xml"
|
346
|
+
possible_rssurl_formats << "/blog/feed/"
|
347
|
+
|
348
|
+
possible_rssurls = []
|
349
|
+
possible_rssurl_formats.each do |url_format|
|
350
|
+
possible_rssurls << UrlCommon.join(site_url, url_format)
|
351
|
+
end
|
352
|
+
|
353
|
+
return possible_rssurls
|
354
|
+
end
|
355
|
+
|
356
|
+
def self.parse_html_for_rssurl_from_head(site_url, page = nil, debug = false)
|
357
|
+
if page
|
358
|
+
status = :ok
|
359
|
+
else
|
360
|
+
status, page = UrlCommon.get_page(site_url)
|
361
|
+
end
|
362
|
+
puts "Into html parse for rssurl" if debug
|
363
|
+
possibles = []
|
364
|
+
if status == :ok && page
|
365
|
+
#results = page.css("link[rel='alternate']")
|
366
|
+
results = page.css("link[rel='alternate'][type='application/rss+xml']")
|
367
|
+
#
|
368
|
+
# If only a single one then return it
|
369
|
+
#
|
370
|
+
#return results.first['href'] if results.first['type'] =~ /application\/rss\+xml/i && results.size == 1
|
371
|
+
return results.first['href'] if results.size == 1
|
372
|
+
|
373
|
+
#
|
374
|
+
# If an array then filter out the comments
|
375
|
+
#
|
376
|
+
results.each do |result|
|
377
|
+
possibles << result unless result['title'] =~ /comments? feed/i
|
378
|
+
end
|
379
|
+
|
380
|
+
#
|
381
|
+
# Loop over the possibles and just return the shortest url
|
382
|
+
#
|
383
|
+
# Todo -- can likely do a better job on this
|
384
|
+
#
|
385
|
+
urls = []
|
386
|
+
possibles.each do |possible|
|
387
|
+
urls << possible['href']
|
388
|
+
end
|
389
|
+
return UrlCommon.select_best_rssurl_from_rssurls(urls)
|
390
|
+
#return urls.sort_by(&:length).first
|
391
|
+
|
392
|
+
|
393
|
+
# results.each do |result|
|
394
|
+
#
|
395
|
+
# end
|
396
|
+
# end
|
397
|
+
# doc = Nokogiri::HTML(page.body)
|
398
|
+
# results << doc.at('link[rel="alternate"]')
|
399
|
+
# results = results.flatten
|
400
|
+
end
|
401
|
+
end
|
402
|
+
|
403
|
+
def self.get_protocol(url)
|
404
|
+
parts = url.to_s.split(":")
|
405
|
+
return parts.first
|
406
|
+
end
|
407
|
+
|
408
|
+
#https://500hats.com/feed
|
409
|
+
# UrlCommon.discover_feed_url("https://nickjanetakis.com")
|
410
|
+
def self.discover_feed_url(site_url, debug = false)
|
411
|
+
# step 1: remove the file from the site_url if it has one
|
412
|
+
# step 2: probe the common ones and 404 check
|
413
|
+
|
414
|
+
#
|
415
|
+
# Build a set of possibles
|
416
|
+
#
|
417
|
+
possible_rssurls = UrlCommon.possible_rssurls(site_url)
|
418
|
+
|
419
|
+
#
|
420
|
+
# Keep track of failures
|
421
|
+
#
|
422
|
+
failed_probes = Set.new
|
423
|
+
|
424
|
+
# step 3: parse the html
|
425
|
+
#<link rel="alternate" type="application/rss+xml" href="http://scripting.com/rss.xml" />
|
426
|
+
#<link rel="alternate" type="application/rss+xml" title="Matt Mullenweg » Feed" href="https://ma.tt/feed/" />
|
427
|
+
#<link rel="alternate" type="application/rss+xml" title="Matt Mullenweg » Comments Feed" href="https://ma.tt/comments/feed/" />
|
428
|
+
|
429
|
+
#
|
430
|
+
# Stage 1 -- do http head probing
|
431
|
+
#
|
432
|
+
possible_rssurls.each do |rssurl|
|
433
|
+
puts "Head Probing for: #{rssurl}" if debug
|
434
|
+
|
435
|
+
# abort if we doubled blog i.e. /blog/blog/ in the url
|
436
|
+
next if rssurl =~ /blog\/blog/
|
437
|
+
next if failed_probes.include?(rssurl)
|
438
|
+
|
439
|
+
status, url = UrlCommon.check_for_404(rssurl, true)
|
440
|
+
random_status, random_url = UrlCommon.test_random_url(site_url)
|
441
|
+
#debugger
|
442
|
+
return rssurl if status == :ok && random_status == :ok
|
443
|
+
failed_probes << rssurl
|
444
|
+
end
|
445
|
+
|
446
|
+
puts "After probe, failed_probes as: #{failed_probes.inspect}"
|
447
|
+
|
448
|
+
#
|
449
|
+
# Stage 2-- if subdirectory go up one level and probe again
|
450
|
+
#
|
451
|
+
# TODO
|
452
|
+
|
453
|
+
|
454
|
+
|
455
|
+
#
|
456
|
+
# Stage 3 -- Goto root and probe again
|
457
|
+
#
|
458
|
+
#test for this is the nick site
|
459
|
+
fuzzy_url_parts = Fuzzyurl.new(site_url)
|
460
|
+
base_url = "#{fuzzy_url_parts.protocol}://#{fuzzy_url_parts.hostname}"
|
461
|
+
possible_rssurls = UrlCommon.possible_rssurls(base_url)
|
462
|
+
#debugger
|
463
|
+
possible_rssurls.each do |rssurl|
|
464
|
+
puts "Head Probing for: #{rssurl} at site root stage" #if debug
|
465
|
+
|
466
|
+
# abort if we doubled blog i.e. /blog/blog/ in the url
|
467
|
+
next if rssurl =~ /blog\/blog/
|
468
|
+
next if failed_probes.include?(rssurl)
|
469
|
+
|
470
|
+
status, url = UrlCommon.check_for_404(rssurl, true)
|
471
|
+
return rssurl if status == :ok
|
472
|
+
failed_probes << rssurl
|
473
|
+
end
|
474
|
+
|
475
|
+
|
476
|
+
#
|
477
|
+
# Stage 4 - parse the html
|
478
|
+
#
|
479
|
+
rssurl = UrlCommon.parse_html_for_rssurl_from_head(site_url, nil, true)
|
480
|
+
return rssurl if rssurl
|
481
|
+
|
482
|
+
#
|
483
|
+
# Stage 5 - fall over to Feedbag
|
484
|
+
#
|
485
|
+
results = Feedbag.find(site_url)
|
486
|
+
# checked_results = []
|
487
|
+
# results.each do |result|
|
488
|
+
# struct = UrlCommon.check_for_404(result)
|
489
|
+
# checked_results << result if struct.status == 200
|
490
|
+
# end
|
491
|
+
|
492
|
+
#
|
493
|
+
# Stage 6 - cache failures to redis so don't look for them again
|
494
|
+
#
|
495
|
+
#$redis.
|
496
|
+
|
497
|
+
return UrlCommon.select_best_rssurl_from_rssurls(results)
|
498
|
+
end
|
499
|
+
|
269
500
|
end
|
data/url_common.gemspec
CHANGED
@@ -26,4 +26,8 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.bindir = "exe"
|
27
27
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
28
28
|
spec.require_paths = ["lib"]
|
29
|
+
|
30
|
+
spec.add_dependency 'fuzzyurl', '~> 0.9.0'
|
31
|
+
spec.add_dependency 'mechanize', '~> 2.6'
|
32
|
+
|
29
33
|
end
|
metadata
CHANGED
@@ -1,15 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_common
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Scott Johnson
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
12
|
-
dependencies:
|
11
|
+
date: 2022-06-04 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: fuzzyurl
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.9.0
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.9.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: mechanize
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '2.6'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '2.6'
|
13
41
|
description: This is a class library for common url manipulation and crawling tasks. It
|
14
42
|
is based on a career focused on the practical side of working with the Internet
|
15
43
|
using Ruby.
|
@@ -24,6 +52,7 @@ files:
|
|
24
52
|
- ".travis.yml"
|
25
53
|
- CODE_OF_CONDUCT.md
|
26
54
|
- Gemfile
|
55
|
+
- Gemfile.lock
|
27
56
|
- LICENSE.txt
|
28
57
|
- README.md
|
29
58
|
- Rakefile
|