wayback_archiver 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 66f995653260c6d884271708691177c6c4e1c5b2
4
- data.tar.gz: b46518ada2f179c8034bb48c41f17195e5f4551d
3
+ metadata.gz: 87e286b40825e15468832af25c42c98b59170b21
4
+ data.tar.gz: 9a7eb7e6fc6db882cf23878718626db04847b33a
5
5
  SHA512:
6
- metadata.gz: 5837775a0b99935895992589a75490859e4b3c2b089ee20e3d339e52913cfc56c1e5e055bd4650bb5678b93abf76f2d7204a27d67ea1e9403c69f20cdab1d422
7
- data.tar.gz: 5d256b29a98bfba85dad1559075885fa0b85e9c632971a735207fff7c636aede116b45581cdf72f8a3054cb351f84ae1556ce1849a0f33738989a108bba7f8f1
6
+ metadata.gz: 562641a2749e3a3e644ad7056f365b8d26566809b66606d1c6b95bf5c2db158039dd4a5b1c33677817c5f61d06786a63ec227150534a4dd5d1f958ccd1171dc9
7
+ data.tar.gz: d635ae3f69ab3e585bf0519389d9335edfe4848b4c6a41f271f957413bb663d930974f2fe386445fef6f7a5bb7ffc9d0d5c3f3bdf8849c0cc66081b7806d61a8
@@ -5,20 +5,23 @@ require 'rexml/document'
5
5
  require 'wayback_archiver/collector'
6
6
  require 'wayback_archiver/archive'
7
7
  require 'wayback_archiver/request'
8
+ require 'wayback_archiver/crawler'
8
9
 
9
10
  module WaybackArchiver
10
11
  BASE_URL = 'https://web.archive.org/save/'
11
12
 
12
13
  def self.archive(source, from = :sitemap)
13
- urls = case from
14
- when :sitemap, 'sitemap'
14
+ urls = case from.to_s
15
+ when 'sitemap'
15
16
  Collector.urls_from_sitemap("#{source}/sitemap.xml")
16
- when :url, 'url'
17
+ when 'url'
17
18
  Array(source)
18
- when :file, 'file'
19
+ when 'file'
19
20
  Collector.urls_from_file(source)
21
+ when 'crawl', 'crawler'
22
+ Crawler.collect_urls(source)
20
23
  else
21
- raise ArgumentError, "Unknown type: '#{from}'. Allowed types: sitemal, url, file"
24
+ raise ArgumentError, "Unknown type: '#{from}'. Allowed types: sitemap, url, file, crawler"
22
25
  end
23
26
  Archive.post(urls)
24
27
  end
@@ -6,9 +6,8 @@ module WaybackArchiver
6
6
  puts "Request will be sent with max #{MAX_THREAD_COUNT} parallel threads"
7
7
 
8
8
  puts "Total urls to be sent: #{all_urls.length}"
9
- threads = Array.new
9
+ threads = []
10
10
  group_size = (all_urls.length / MAX_THREAD_COUNT) + 1
11
-
12
11
  all_urls.each_slice(group_size).to_a.each do |urls|
13
12
  threads << Thread.new do
14
13
  urls.each_with_index do |url, index|
@@ -23,8 +22,17 @@ module WaybackArchiver
23
22
  end
24
23
  end
25
24
  end
26
- threads.each(&:join)
25
+ threads.each_with_index do |thread, index|
26
+ print_index = index + 1
27
+ progress = '['
28
+ progress << '#' * print_index
29
+ progress << ' ' * (threads.length - print_index)
30
+ progress << ']'
31
+ procent = ((print_index.to_f/threads.length.to_f) * 100).round(0)
32
+ puts "[PROGRESS] #{progress} #{procent}% (#{print_index}/#{threads.length})"
33
+ thread.join
34
+ end
35
+ all_urls
27
36
  end
28
-
29
37
  end
30
38
  end
@@ -0,0 +1,50 @@
1
+ require 'set'
2
+ require 'nokogiri'
3
+ require 'open-uri'
4
+
5
module WaybackArchiver
  # Breadth-first crawler: starting from a base URL, fetches pages,
  # extracts <a href> links and enqueues every not-yet-seen URL that
  # resolves to the same host, until the queue is drained.
  class Crawler
    # base_url - String root URL the crawl starts from; also used to
    #            resolve relative links and to reject foreign hosts.
    def initialize(base_url)
      @base_url    = base_url
      @hostname    = URI.parse(@base_url).host
      @fetch_queue = Set.new
      @processed   = Set.new # fixed typo: was @procesed
      @fetch_queue << @base_url
    end

    # Convenience entry point: crawl +base_url+ and return the URLs found.
    def self.collect_urls(base_url)
      new(base_url).collect_urls
    end

    # Drain the fetch queue, visiting each page exactly once.
    #
    # Returns an Array of every URL that was processed.
    def collect_urls
      until @fetch_queue.empty?
        # Set has no #pop; take the first element and delete it in one pass
        # (the original recomputed @fetch_queue.first for the delete).
        url = @fetch_queue.first
        @fetch_queue.delete(url)
        page_links(url)
      end
      puts "Crawling finished, #{@processed.length} links found"
      @processed.to_a
    end

    # Fetch +url+, mark it processed and enqueue any unseen same-host links.
    def page_links(url)
      puts "Queue length: #{@fetch_queue.length}, Parsing: #{url}"
      # NOTE(review): Kernel#open via open-uri executes a subprocess when the
      # argument starts with '|' — prefer URI.parse(url).open for untrusted input.
      link_elements =
        begin
          Nokogiri::HTML(open(url)).css('a')
        rescue StandardError
          [] # unreachable or unparsable pages are skipped, not fatal
        end
      @processed << url
      link_elements.each do |link|
        href = sanitize_url(link.attr('href'))
        @fetch_queue << href if href && !@processed.include?(href)
      end
    end

    # Normalize +raw_url+: host-less (relative) links are resolved against
    # @base_url; absolute links are kept only when they point at the same host.
    #
    # Returns the absolute same-host URL String, or nil when the link should
    # be discarded.
    def sanitize_url(raw_url)
      url =
        begin
          URI.parse(raw_url)
        rescue StandardError
          URI.parse('') # unparsable href → empty URI → resolves to @base_url
        end
      if url.host.nil?
        # NOTE(review): plain concatenation mishandles hrefs without a leading
        # slash (e.g. "about.html") — URI.join would be more robust.
        sanitized_url = "#{@base_url}#{url.path}"
        sanitized_url += "?#{url.query}" unless url.query.nil?
        sanitized_url
      else
        raw_url if raw_url.include?(@base_url) && @hostname.eql?(url.hostname)
      end
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module WaybackArchiver
2
- VERSION = '0.0.4'
2
+ VERSION = '0.0.5'
3
3
  end
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_archiver
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-09-11 00:00:00.000000000 Z
11
+ date: 2014-10-20 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: bundler
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -49,6 +63,7 @@ files:
49
63
  - bin/wayback_archiver
50
64
  - lib/wayback_archiver/archive.rb
51
65
  - lib/wayback_archiver/collector.rb
66
+ - lib/wayback_archiver/crawler.rb
52
67
  - lib/wayback_archiver/request.rb
53
68
  - lib/wayback_archiver/version.rb
54
69
  - lib/wayback_archiver.rb
@@ -77,3 +92,4 @@ signing_key:
77
92
  specification_version: 4
78
93
  summary: Send URLs to Wayback Machine
79
94
  test_files: []
95
+ has_rdoc: