wayback_archiver 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 66f995653260c6d884271708691177c6c4e1c5b2
4
- data.tar.gz: b46518ada2f179c8034bb48c41f17195e5f4551d
3
+ metadata.gz: 87e286b40825e15468832af25c42c98b59170b21
4
+ data.tar.gz: 9a7eb7e6fc6db882cf23878718626db04847b33a
5
5
  SHA512:
6
- metadata.gz: 5837775a0b99935895992589a75490859e4b3c2b089ee20e3d339e52913cfc56c1e5e055bd4650bb5678b93abf76f2d7204a27d67ea1e9403c69f20cdab1d422
7
- data.tar.gz: 5d256b29a98bfba85dad1559075885fa0b85e9c632971a735207fff7c636aede116b45581cdf72f8a3054cb351f84ae1556ce1849a0f33738989a108bba7f8f1
6
+ metadata.gz: 562641a2749e3a3e644ad7056f365b8d26566809b66606d1c6b95bf5c2db158039dd4a5b1c33677817c5f61d06786a63ec227150534a4dd5d1f958ccd1171dc9
7
+ data.tar.gz: d635ae3f69ab3e585bf0519389d9335edfe4848b4c6a41f271f957413bb663d930974f2fe386445fef6f7a5bb7ffc9d0d5c3f3bdf8849c0cc66081b7806d61a8
@@ -5,20 +5,23 @@ require 'rexml/document'
5
5
  require 'wayback_archiver/collector'
6
6
  require 'wayback_archiver/archive'
7
7
  require 'wayback_archiver/request'
8
+ require 'wayback_archiver/crawler'
8
9
 
9
10
  module WaybackArchiver
10
11
  BASE_URL = 'https://web.archive.org/save/'
11
12
 
12
13
  def self.archive(source, from = :sitemap)
13
- urls = case from
14
- when :sitemap, 'sitemap'
14
+ urls = case from.to_s
15
+ when 'sitemap'
15
16
  Collector.urls_from_sitemap("#{source}/sitemap.xml")
16
- when :url, 'url'
17
+ when 'url'
17
18
  Array(source)
18
- when :file, 'file'
19
+ when 'file'
19
20
  Collector.urls_from_file(source)
21
+ when 'crawl', 'crawler'
22
+ Crawler.collect_urls(source)
20
23
  else
21
- raise ArgumentError, "Unknown type: '#{from}'. Allowed types: sitemal, url, file"
24
+ raise ArgumentError, "Unknown type: '#{from}'. Allowed types: sitemap, url, file, crawler"
22
25
  end
23
26
  Archive.post(urls)
24
27
  end
@@ -6,9 +6,8 @@ module WaybackArchiver
6
6
  puts "Request will be sent with max #{MAX_THREAD_COUNT} parallel threads"
7
7
 
8
8
  puts "Total urls to be sent: #{all_urls.length}"
9
- threads = Array.new
9
+ threads = []
10
10
  group_size = (all_urls.length / MAX_THREAD_COUNT) + 1
11
-
12
11
  all_urls.each_slice(group_size).to_a.each do |urls|
13
12
  threads << Thread.new do
14
13
  urls.each_with_index do |url, index|
@@ -23,8 +22,17 @@ module WaybackArchiver
23
22
  end
24
23
  end
25
24
  end
26
- threads.each(&:join)
25
+ threads.each_with_index do |thread, index|
26
+ print_index = index + 1
27
+ progress = '['
28
+ progress << '#' * print_index
29
+ progress << ' ' * (threads.length - print_index)
30
+ progress << ']'
31
+ procent = ((print_index.to_f/threads.length.to_f) * 100).round(0)
32
+ puts "[PROGRESS] #{progress} #{procent}% (#{print_index}/#{threads.length})"
33
+ thread.join
34
+ end
35
+ all_urls
27
36
  end
28
-
29
37
  end
30
38
  end
@@ -0,0 +1,50 @@
1
+ require 'set'
2
+ require 'nokogiri'
3
+ require 'open-uri'
4
+
5
+ module WaybackArchiver
6
+ class Crawler
7
+ def initialize(base_url)
8
+ @base_url = base_url
9
+ @hostname = URI.parse(@base_url).host
10
+ @fetch_queue = Set.new
11
+ @procesed = Set.new
12
+ @fetch_queue << @base_url
13
+ end
14
+
15
+ def self.collect_urls(base_url)
16
+ new(base_url).collect_urls
17
+ end
18
+
19
+ def collect_urls
20
+ until @fetch_queue.empty?
21
+ url = @fetch_queue.first
22
+ @fetch_queue.delete(@fetch_queue.first)
23
+ page_links(url)
24
+ end
25
+ puts "Crawling finished, #{@procesed.length} links found"
26
+ @procesed.to_a
27
+ end
28
+
29
+ def page_links(url)
30
+ puts "Queue length: #{@fetch_queue.length}, Parsing: #{url}"
31
+ link_elements = Nokogiri::HTML(open(url)).css('a') rescue []
32
+ @procesed << url
33
+ link_elements.each do |link|
34
+ href = sanitize_url(link.attr('href'))
35
+ @fetch_queue << href if href && !@procesed.include?(href)
36
+ end
37
+ end
38
+
39
+ def sanitize_url(raw_url)
40
+ url = URI.parse(raw_url) rescue URI.parse('')
41
+ if url.host.nil?
42
+ sanitized_url = "#{@base_url}#{url.path}"
43
+ sanitized_url += "?#{url.query}" unless url.query.nil?
44
+ sanitized_url
45
+ else
46
+ raw_url if raw_url.include?(@base_url) && @hostname.eql?(url.hostname)
47
+ end
48
+ end
49
+ end
50
+ end
@@ -1,3 +1,3 @@
1
1
  module WaybackArchiver
2
- VERSION = '0.0.4'
2
+ VERSION = '0.0.5'
3
3
  end
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_archiver
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-09-11 00:00:00.000000000 Z
11
+ date: 2014-10-20 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: bundler
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -49,6 +63,7 @@ files:
49
63
  - bin/wayback_archiver
50
64
  - lib/wayback_archiver/archive.rb
51
65
  - lib/wayback_archiver/collector.rb
66
+ - lib/wayback_archiver/crawler.rb
52
67
  - lib/wayback_archiver/request.rb
53
68
  - lib/wayback_archiver/version.rb
54
69
  - lib/wayback_archiver.rb
@@ -77,3 +92,4 @@ signing_key:
77
92
  specification_version: 4
78
93
  summary: Send URLs to Wayback Machine
79
94
  test_files: []
95
+ has_rdoc: