url_common 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/Gemfile.lock +72 -0
- data/README.md +4 -2
- data/lib/url_common/version.rb +1 -1
- data/lib/url_common.rb +231 -0
- data/url_common.gemspec +4 -0
- metadata +32 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e75ba5e9d78301e4cf753f4ade0a54b036aaca17d7527791b984d8bc81215a5a
|
4
|
+
data.tar.gz: 7bae7598dd4884c9c795e511c32c02cca9095a32f23522e0241842631250488b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d633158a6b206eca935948a72ebcce4bf90a196c373eca02ae9895819f5b027333b60e4dc5ec84f7b6507d4f468ac1752d36ca367abae4b03b65fd6665857095
|
7
|
+
data.tar.gz: a48dc2eace4656f5b2e16957174ae399633d97ddcff830d799ce31dee94ebfcbe648a4e3a1eb78c0ea8a91f63d8df22df783eed7f99d21920a2ea6a3b6edef44
|
data/Gemfile
CHANGED
data/Gemfile.lock
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
url_common (0.1.1)
|
5
|
+
fuzzyurl (~> 0.9.0)
|
6
|
+
mechanize (~> 2.6)
|
7
|
+
|
8
|
+
GEM
|
9
|
+
remote: https://rubygems.org/
|
10
|
+
specs:
|
11
|
+
byebug (11.1.3)
|
12
|
+
connection_pool (2.2.3)
|
13
|
+
diff-lcs (1.4.4)
|
14
|
+
domain_name (0.5.20190701)
|
15
|
+
unf (>= 0.0.5, < 1.0.0)
|
16
|
+
fuzzyurl (0.9.0)
|
17
|
+
http-cookie (1.0.3)
|
18
|
+
domain_name (~> 0.5)
|
19
|
+
mechanize (2.7.6)
|
20
|
+
domain_name (~> 0.5, >= 0.5.1)
|
21
|
+
http-cookie (~> 1.0)
|
22
|
+
mime-types (>= 1.17.2)
|
23
|
+
net-http-digest_auth (~> 1.1, >= 1.1.1)
|
24
|
+
net-http-persistent (>= 2.5.2)
|
25
|
+
nokogiri (~> 1.6)
|
26
|
+
ntlm-http (~> 0.1, >= 0.1.1)
|
27
|
+
webrobots (>= 0.0.9, < 0.2)
|
28
|
+
mime-types (3.3.1)
|
29
|
+
mime-types-data (~> 3.2015)
|
30
|
+
mime-types-data (3.2020.0512)
|
31
|
+
mini_portile2 (2.4.0)
|
32
|
+
net-http-digest_auth (1.4.1)
|
33
|
+
net-http-persistent (4.0.0)
|
34
|
+
connection_pool (~> 2.2)
|
35
|
+
nokogiri (1.10.10)
|
36
|
+
mini_portile2 (~> 2.4.0)
|
37
|
+
ntlm-http (0.1.1)
|
38
|
+
rake (12.3.3)
|
39
|
+
rspec (3.9.0)
|
40
|
+
rspec-core (~> 3.9.0)
|
41
|
+
rspec-expectations (~> 3.9.0)
|
42
|
+
rspec-mocks (~> 3.9.0)
|
43
|
+
rspec-core (3.9.2)
|
44
|
+
rspec-support (~> 3.9.3)
|
45
|
+
rspec-expectations (3.9.2)
|
46
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
47
|
+
rspec-support (~> 3.9.0)
|
48
|
+
rspec-mocks (3.9.1)
|
49
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
50
|
+
rspec-support (~> 3.9.0)
|
51
|
+
rspec-support (3.9.3)
|
52
|
+
unf (0.1.4)
|
53
|
+
unf_ext
|
54
|
+
unf_ext (0.0.7.7)
|
55
|
+
webrobots (0.1.2)
|
56
|
+
|
57
|
+
PLATFORMS
|
58
|
+
ruby
|
59
|
+
|
60
|
+
DEPENDENCIES
|
61
|
+
byebug
|
62
|
+
fuzzyurl (~> 0.9.0)
|
63
|
+
mechanize (~> 2.6)
|
64
|
+
rake (~> 12.0)
|
65
|
+
rspec (~> 3.0)
|
66
|
+
url_common!
|
67
|
+
|
68
|
+
RUBY VERSION
|
69
|
+
ruby 2.7.1p83
|
70
|
+
|
71
|
+
BUNDLED WITH
|
72
|
+
2.1.4
|
data/README.md
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
# UrlCommon
|
2
2
|
|
3
|
-
|
3
|
+
This is a gem for performing common URL-centric tasks. I wrote this years ago and have always just moved it from project to project, leading to a huge number of different versions on my development system. Finally, I'm creating a gem out of it to make it easier to use across projects.
|
4
4
|
|
5
|
-
|
5
|
+
I don't claim that these are great, perfect, etc. I claim that they are workmanlike tools which I FIND USEFUL, and I want to use them more easily across multiple projects; hence the open sourcing of them.
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
@@ -24,6 +24,8 @@ Or install it yourself as:
|
|
24
24
|
|
25
25
|
TODO: Write usage instructions here
|
26
26
|
|
27
|
+
This is a todo.
|
28
|
+
|
27
29
|
## Development
|
28
30
|
|
29
31
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/lib/url_common/version.rb
CHANGED
data/lib/url_common.rb
CHANGED
@@ -266,4 +266,235 @@ module UrlCommon
|
|
266
266
|
|
267
267
|
return mechanize_page
|
268
268
|
end
|
269
|
+
|
270
|
+
#TODO needs tests
|
271
|
+
def self.get_meta_description(url, html)
|
272
|
+
page = UrlCommon.create_mechanize_page_from_html(url, html)
|
273
|
+
description = ""
|
274
|
+
begin
|
275
|
+
description = page.parser.at("meta[name='description']")['content']
|
276
|
+
rescue StandardError => e
|
277
|
+
end
|
278
|
+
return description
|
279
|
+
end
|
280
|
+
|
281
|
+
#TODO needs tests
|
282
|
+
def self.get_page_title(url, html)
|
283
|
+
page = UrlCommon.create_mechanize_page_from_html(url, html)
|
284
|
+
title = ""
|
285
|
+
begin
|
286
|
+
title = page.parser.css('title').first.content
|
287
|
+
rescue StandardError => e
|
288
|
+
end
|
289
|
+
return title
|
290
|
+
end
|
291
|
+
|
292
|
+
def self.extract_links_from_text(text)
|
293
|
+
agent = Mechanize.new
|
294
|
+
html = "<HTML><BODY>#{text}</BODY></HTML>"
|
295
|
+
page = Mechanize::Page.new(nil,{'content-type'=>'text/html'},html,nil,agent)
|
296
|
+
return page.links
|
297
|
+
end
|
298
|
+
|
299
|
+
# https://docs.aylien.com/textapi/#using-the-api
|
300
|
+
def self.summarize_url(url)
|
301
|
+
#GET /summarize?url=http://www.bbc.com/sport/0/football/25912393
|
302
|
+
agent = Mechanize.new
|
303
|
+
summarization_url = ""
|
304
|
+
page = agent.get(url)
|
305
|
+
end
|
306
|
+
|
307
|
+
# fucking idiotic test case for this fucking idiot is: https://devslopes.com/
|
308
|
+
def self.test_random_url(url_or_host)
|
309
|
+
random_filename = TextCommon.sha(Time.now.to_s) + ".xml"
|
310
|
+
if url_or_host =~ /http/
|
311
|
+
url = File.join(url_or_host, random_filename)
|
312
|
+
else
|
313
|
+
url = File.join("http://", host, random_filename)
|
314
|
+
end
|
315
|
+
status, url = UrlCommon.check_for_404(url, true)
|
316
|
+
#
|
317
|
+
# Key bit of logic -- if we get a return value for a randomized sha then that means that
|
318
|
+
# a) the destination site owner is a fucking moron
|
319
|
+
# b) that the destination site owner has set his site so it NEVER returns a 404
|
320
|
+
# c) they're a fucking moron
|
321
|
+
# d) if I get a 200 back then it means that they return you to the home page for anything and NOT
|
322
|
+
# a proper 404 so need to flip flop the logic and return error on a 200; sheesh
|
323
|
+
#
|
324
|
+
return :error, url if status == :ok
|
325
|
+
return :ok, url
|
326
|
+
end
|
327
|
+
|
328
|
+
def self.select_best_rssurl_from_rssurls(urls)
|
329
|
+
return urls.sort_by(&:length).first
|
330
|
+
end
|
331
|
+
|
332
|
+
def self.possible_rssurls(site_url, skip_slash_blog = false)
|
333
|
+
# urls we will probe
|
334
|
+
possible_rssurl_formats = []
|
335
|
+
|
336
|
+
# normal baselines
|
337
|
+
possible_rssurl_formats << "feed.xml"
|
338
|
+
possible_rssurl_formats << "rss.xml"
|
339
|
+
possible_rssurl_formats << "atom.xml"
|
340
|
+
possible_rssurl_formats << "feed/"
|
341
|
+
|
342
|
+
# optionally look at /blog/
|
343
|
+
possible_rssurl_formats << "/blog/feed.xml"
|
344
|
+
possible_rssurl_formats << "/blog/rss.xml"
|
345
|
+
possible_rssurl_formats << "/blog/atom.xml"
|
346
|
+
possible_rssurl_formats << "/blog/feed/"
|
347
|
+
|
348
|
+
possible_rssurls = []
|
349
|
+
possible_rssurl_formats.each do |url_format|
|
350
|
+
possible_rssurls << UrlCommon.join(site_url, url_format)
|
351
|
+
end
|
352
|
+
|
353
|
+
return possible_rssurls
|
354
|
+
end
|
355
|
+
|
356
|
+
def self.parse_html_for_rssurl_from_head(site_url, page = nil, debug = false)
|
357
|
+
if page
|
358
|
+
status = :ok
|
359
|
+
else
|
360
|
+
status, page = UrlCommon.get_page(site_url)
|
361
|
+
end
|
362
|
+
puts "Into html parse for rssurl" if debug
|
363
|
+
possibles = []
|
364
|
+
if status == :ok && page
|
365
|
+
#results = page.css("link[rel='alternate']")
|
366
|
+
results = page.css("link[rel='alternate'][type='application/rss+xml']")
|
367
|
+
#
|
368
|
+
# If only a single one then return it
|
369
|
+
#
|
370
|
+
#return results.first['href'] if results.first['type'] =~ /application\/rss\+xml/i && results.size == 1
|
371
|
+
return results.first['href'] if results.size == 1
|
372
|
+
|
373
|
+
#
|
374
|
+
# If an array then filter out the comments
|
375
|
+
#
|
376
|
+
results.each do |result|
|
377
|
+
possibles << result unless result['title'] =~ /comments? feed/i
|
378
|
+
end
|
379
|
+
|
380
|
+
#
|
381
|
+
# Loop over the possibles and just return the shortest url
|
382
|
+
#
|
383
|
+
# Todo -- can likely do a better job on this
|
384
|
+
#
|
385
|
+
urls = []
|
386
|
+
possibles.each do |possible|
|
387
|
+
urls << possible['href']
|
388
|
+
end
|
389
|
+
return UrlCommon.select_best_rssurl_from_rssurls(urls)
|
390
|
+
#return urls.sort_by(&:length).first
|
391
|
+
|
392
|
+
|
393
|
+
# results.each do |result|
|
394
|
+
#
|
395
|
+
# end
|
396
|
+
# end
|
397
|
+
# doc = Nokogiri::HTML(page.body)
|
398
|
+
# results << doc.at('link[rel="alternate"]')
|
399
|
+
# results = results.flatten
|
400
|
+
end
|
401
|
+
end
|
402
|
+
|
403
|
+
def self.get_protocol(url)
|
404
|
+
parts = url.to_s.split(":")
|
405
|
+
return parts.first
|
406
|
+
end
|
407
|
+
|
408
|
+
#https://500hats.com/feed
|
409
|
+
# UrlCommon.discover_feed_url("https://nickjanetakis.com")
|
410
|
+
def self.discover_feed_url(site_url, debug = false)
|
411
|
+
# step 1: remove the file from the site_url if it has one
|
412
|
+
# step 2: probe the common ones and 404 check
|
413
|
+
|
414
|
+
#
|
415
|
+
# Build a set of possibles
|
416
|
+
#
|
417
|
+
possible_rssurls = UrlCommon.possible_rssurls(site_url)
|
418
|
+
|
419
|
+
#
|
420
|
+
# Keep track of failures
|
421
|
+
#
|
422
|
+
failed_probes = Set.new
|
423
|
+
|
424
|
+
# step 3: parse the html
|
425
|
+
#<link rel="alternate" type="application/rss+xml" href="http://scripting.com/rss.xml" />
|
426
|
+
#<link rel="alternate" type="application/rss+xml" title="Matt Mullenweg » Feed" href="https://ma.tt/feed/" />
|
427
|
+
#<link rel="alternate" type="application/rss+xml" title="Matt Mullenweg » Comments Feed" href="https://ma.tt/comments/feed/" />
|
428
|
+
|
429
|
+
#
|
430
|
+
# Stage 1 -- do http head probing
|
431
|
+
#
|
432
|
+
possible_rssurls.each do |rssurl|
|
433
|
+
puts "Head Probing for: #{rssurl}" if debug
|
434
|
+
|
435
|
+
# abort if we doubled blog i.e. /blog/blog/ in the url
|
436
|
+
next if rssurl =~ /blog\/blog/
|
437
|
+
next if failed_probes.include?(rssurl)
|
438
|
+
|
439
|
+
status, url = UrlCommon.check_for_404(rssurl, true)
|
440
|
+
random_status, random_url = UrlCommon.test_random_url(site_url)
|
441
|
+
#debugger
|
442
|
+
return rssurl if status == :ok && random_status == :ok
|
443
|
+
failed_probes << rssurl
|
444
|
+
end
|
445
|
+
|
446
|
+
puts "After probe, failed_probes as: #{failed_probes.inspect}"
|
447
|
+
|
448
|
+
#
|
449
|
+
# Stage 2-- if subdirectory go up one level and probe again
|
450
|
+
#
|
451
|
+
# TODO
|
452
|
+
|
453
|
+
|
454
|
+
|
455
|
+
#
|
456
|
+
# Stage 3 -- Go to the site root and probe again
|
457
|
+
#
|
458
|
+
#test for this is the nick site
|
459
|
+
fuzzy_url_parts = Fuzzyurl.new(site_url)
|
460
|
+
base_url = "#{fuzzy_url_parts.protocol}://#{fuzzy_url_parts.hostname}"
|
461
|
+
possible_rssurls = UrlCommon.possible_rssurls(base_url)
|
462
|
+
#debugger
|
463
|
+
possible_rssurls.each do |rssurl|
|
464
|
+
puts "Head Probing for: #{rssurl} at site root stage" #if debug
|
465
|
+
|
466
|
+
# abort if we doubled blog i.e. /blog/blog/ in the url
|
467
|
+
next if rssurl =~ /blog\/blog/
|
468
|
+
next if failed_probes.include?(rssurl)
|
469
|
+
|
470
|
+
status, url = UrlCommon.check_for_404(rssurl, true)
|
471
|
+
return rssurl if status == :ok
|
472
|
+
failed_probes << rssurl
|
473
|
+
end
|
474
|
+
|
475
|
+
|
476
|
+
#
|
477
|
+
# Stage 4 - parse the html
|
478
|
+
#
|
479
|
+
rssurl = UrlCommon.parse_html_for_rssurl_from_head(site_url, nil, true)
|
480
|
+
return rssurl if rssurl
|
481
|
+
|
482
|
+
#
|
483
|
+
# Stage 5 - fall back to Feedbag
|
484
|
+
#
|
485
|
+
results = Feedbag.find(site_url)
|
486
|
+
# checked_results = []
|
487
|
+
# results.each do |result|
|
488
|
+
# struct = UrlCommon.check_for_404(result)
|
489
|
+
# checked_results << result if struct.status == 200
|
490
|
+
# end
|
491
|
+
|
492
|
+
#
|
493
|
+
# Stage 6 - cache failures to redis so don't look for them again
|
494
|
+
#
|
495
|
+
#$redis.
|
496
|
+
|
497
|
+
return UrlCommon.select_best_rssurl_from_rssurls(results)
|
498
|
+
end
|
499
|
+
|
269
500
|
end
|
data/url_common.gemspec
CHANGED
@@ -26,4 +26,8 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.bindir = "exe"
|
27
27
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
28
28
|
spec.require_paths = ["lib"]
|
29
|
+
|
30
|
+
spec.add_dependency 'fuzzyurl', '~> 0.9.0'
|
31
|
+
spec.add_dependency 'mechanize', '~> 2.6'
|
32
|
+
|
29
33
|
end
|
metadata
CHANGED
@@ -1,15 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_common
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Scott Johnson
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
12
|
-
dependencies:
|
11
|
+
date: 2022-06-04 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: fuzzyurl
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.9.0
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.9.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: mechanize
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '2.6'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '2.6'
|
13
41
|
description: This is a class library for common url manipulation and crawling tasks. It
|
14
42
|
is based on a career focused on the practical side of working with the Internet
|
15
43
|
using Ruby.
|
@@ -24,6 +52,7 @@ files:
|
|
24
52
|
- ".travis.yml"
|
25
53
|
- CODE_OF_CONDUCT.md
|
26
54
|
- Gemfile
|
55
|
+
- Gemfile.lock
|
27
56
|
- LICENSE.txt
|
28
57
|
- README.md
|
29
58
|
- Rakefile
|