wayback_archiver 0.2.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: ccebbb815d374658a9b2e1b2998a40041115d295
-  data.tar.gz: 153afa895756670988fa3663f44fd1fd5a2e5e3e
+  metadata.gz: 1f9f979d5fa0d31cfdf61660baa3464bdb1425e5
+  data.tar.gz: 1d5701273bbe4d02b2ba5d88f9e75c9477058a28
 SHA512:
-  metadata.gz: db7d655b1ea642618797d7fa3ece8357fa608dcb60322da76be7d793de11cd61df61f796025000ed0099edd96711aa78db607ce0614e9256e5921c252a7d931b
-  data.tar.gz: 919d8473c7f97bbd36c9065b95a99304ace7e0556a8dcd0f92eedf00d65961400dd92c63d86e405aa97babd5d254b522384f665ff39ababdc71afed9c13333ad
+  metadata.gz: e69883c975584b3120993371b29a0c0b7a71f3fd4210764b4caa712d4071ec175dc47fa63217950968da62485e927c05fcc977dfb076a317be4754cf4f16ec90
+  data.tar.gz: 51f80b591f40f4bc22b5bf2e2b7d56273c3c495db43d8a9268f11048a410ad355a3ff238088a4abd83c69fd585db5e1e704d34d22dac3d4f4b9b2f73089458ac
data/bin/wayback_archiver CHANGED
@@ -1,14 +1,78 @@
 #!/usr/bin/env ruby
 
+require 'optparse'
 require 'wayback_archiver'
 
-url = ARGV[0]
-from = ARGV[1]
+# Default values
+urls = nil
+strategy = 'auto'
+log = STDOUT
+log_level = Logger::INFO
+concurrency = WaybackArchiver.concurrency
 
-WaybackArchiver.logger = Logger.new(STDOUT)
+optparse = OptionParser.new do |parser|
+  parser.banner = 'Usage: wayback_archiver [<url>] [options]'
 
-if from.nil?
-  WaybackArchiver.archive(url)
-else
-  WaybackArchiver.archive(url, from)
+  parser.on('--auto', 'Auto (default)') do |value|
+    strategy = 'auto'
+  end
+
+  parser.on('--crawl', 'Crawl') do |value|
+    strategy = 'crawl'
+  end
+
+  parser.on('--sitemap', 'Sitemap') do |value|
+    strategy = 'sitemap'
+  end
+
+  parser.on('--urls', '--url', 'URL(s)') do |value|
+    strategy = 'urls'
+  end
+
+  parser.on('--concurrency=5', Integer, 'Concurrency') do |value|
+    concurrency = value
+  end
+
+  parser.on('--log=output.log', String, 'Path to desired log file (if no argument is given it defaults to STDOUT)') do |path|
+    log = path
+  end
+
+  parser.on('--[no-]verbose', 'Verbose logs') do |value|
+    log_level = value ? Logger::DEBUG : Logger::WARN
+  end
+
+  parser.on('-h', '--help', 'How to use') do
+    puts parser
+    exit
+  end
+
+  # No argument, shows at tail. This will print an options summary.
+  parser.on_tail('-h', '--help', 'Show this message') do
+    puts parser
+    exit
+  end
+
+  parser.on_tail('--version', 'Show version') do
+    puts "WaybackArchiver version #{WaybackArchiver::VERSION}"
+    exit
+  end
+end
+
+optparse.parse!
+
+urls = ARGV.map(&:strip).reject(&:empty?)
+if urls.empty?
+  puts optparse.help
+  raise ArgumentError, "[<url>] is required"
+end
+
+WaybackArchiver.logger = Logger.new(log).tap do |logger|
+  logger.progname = 'WaybackArchiver'
+  logger.level = log_level
+end
+
+# If no strategy has explicitly been given, then default to 'auto'
+strategy ||= 'auto'
+urls.each do |url|
+  WaybackArchiver.archive(url, strategy: strategy, concurrency: concurrency)
 end
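
For reference, the rewritten CLI can be driven like this (illustrative invocations assembled from the option definitions above, not taken from the gem's own docs):

    wayback_archiver example.com                            # auto strategy (default)
    wayback_archiver example.com --crawl --concurrency=10
    wayback_archiver example.com/sitemap.xml --sitemap --log=output.log --verbose
    wayback_archiver https://example.com --url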
data/lib/wayback_archiver.rb CHANGED
@@ -1,60 +1,155 @@
-require 'uri'
-require 'net/http'
-
-require 'concurrent'
-
+require 'wayback_archiver/thread_pool'
 require 'wayback_archiver/null_logger'
 require 'wayback_archiver/version'
 require 'wayback_archiver/url_collector'
 require 'wayback_archiver/archive'
-require 'wayback_archiver/request'
+require 'wayback_archiver/sitemapper'
 
-# WaybackArchiver, send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
+# WaybackArchiver, send URLs to Wayback Machine. By crawling, sitemap or by passing a list of URLs.
 module WaybackArchiver
   # Link to gem on rubygems.org, part of the sent User-Agent
   INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'.freeze
   # WaybackArchiver User-Agent
   USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})".freeze
 
+  # Default concurrency for archiving URLs
+  DEFAULT_CONCURRENCY = 5
+
   # Send URLs to Wayback Machine.
-  # @return [Array] with URLs sent to the Wayback Machine.
-  # @param [String] source for URL(s).
-  # @param [String/Symbol] type of source. Supported types: ['crawl', 'sitemap', 'url', 'file'].
+  # @return [Array<String>] of URLs sent to the Wayback Machine.
+  # @param [String/Array<String>] source for URL(s).
+  # @param [String/Symbol] strategy of source. Supported strategies: crawl, sitemap, url, urls, auto.
   # @example Crawl example.com and send all URLs of the same domain
-  #   WaybackArchiver.archive('example.com') # Default type is :crawl
+  #   WaybackArchiver.archive('example.com') # Default strategy is :auto
+  #   WaybackArchiver.archive('example.com', strategy: :auto)
+  #   WaybackArchiver.archive('example.com', strategy: :auto, concurrency: 10)
+  #   WaybackArchiver.archive('example.com', :auto)
+  # @example Crawl example.com and send all URLs of the same domain
+  #   WaybackArchiver.archive('example.com', strategy: :crawl)
+  #   WaybackArchiver.archive('example.com', strategy: :crawl, concurrency: 10)
   #   WaybackArchiver.archive('example.com', :crawl)
+  # @example Send example.com Sitemap URLs
+  #   WaybackArchiver.archive('example.com', strategy: :sitemap)
+  #   WaybackArchiver.archive('example.com', strategy: :sitemap, concurrency: 10)
+  #   WaybackArchiver.archive('example.com', :sitemap)
   # @example Send only example.com
+  #   WaybackArchiver.archive('example.com', strategy: :url)
+  #   WaybackArchiver.archive('example.com', strategy: :url, concurrency: 10)
   #   WaybackArchiver.archive('example.com', :url)
-  # @example Send URL on each line in specified file
-  #   WaybackArchiver.archive('/path/to/file', :file)
-  def self.archive(source, type = :crawl)
-    case type.to_s
-    when 'file' then Archive.post(UrlCollector.file(source))
-    when 'crawl' then crawl(source)
-    when 'sitemap' then Archive.post(UrlCollector.sitemap(source))
-    when 'url' then Archive.post_url(Request.resolve_url(source))
+  def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency)
+    strategy = legacy_strategy || strategy
+
+    case strategy.to_s
+    when 'crawl' then crawl(source, concurrency: concurrency)
+    when 'auto' then auto(source, concurrency: concurrency)
+    when 'sitemap' then sitemap(source, concurrency: concurrency)
+    when 'urls' then urls(source, concurrency: concurrency)
+    when 'url' then urls(source, concurrency: concurrency)
    else
-      raise ArgumentError, "Unknown type: '#{type}'. Allowed types: sitemap, url, file, crawl"
+      raise ArgumentError, "Unknown strategy: '#{strategy}'. Allowed strategies: sitemap, urls, url, crawl"
    end
  end
 
+  # Look for Sitemap(s) and if nothing is found fall back to crawling.
+  # Then send found URLs to the Wayback Machine.
+  # @return [Array<String>] of URLs sent to the Wayback Machine.
+  # @param [String] source (must be a valid URL).
+  # @param concurrency [Integer]
+  # @example Auto archive example.com
+  #   WaybackArchiver.auto('example.com') # Default concurrency is 5
+  # @example Auto archive example.com with low concurrency
+  #   WaybackArchiver.auto('example.com', concurrency: 1)
+  # @see http://www.sitemaps.org
+  def self.auto(source, concurrency: WaybackArchiver.concurrency)
+    urls = Sitemapper.autodiscover(source)
+    return urls(urls, concurrency: concurrency) if urls.any?
+
+    crawl(source, concurrency: concurrency)
+  end
+
   # Crawl site for URLs to send to the Wayback Machine.
-  # @return [Array] with URLs sent to the Wayback Machine.
-  # @param [String] source for URL(s).
-  # @param [Integer] concurrency.
+  # @return [Array<String>] of URLs sent to the Wayback Machine.
+  # @param [String] url to start crawling from.
+  # @param concurrency [Integer]
   # @example Crawl example.com and send all URLs of the same domain
   #   WaybackArchiver.crawl('example.com') # Default concurrency is 5
   # @example Crawl example.com and send all URLs of the same domain with low concurrency
   #   WaybackArchiver.crawl('example.com', concurrency: 1)
-  def self.crawl(source, concurrency: Archive::DEFAULT_CONCURRENCY)
-    Archive.crawl(source, concurrency: concurrency)
+  def self.crawl(url, concurrency: WaybackArchiver.concurrency)
+    WaybackArchiver.logger.info "Crawling #{url}"
+    Archive.crawl(url, concurrency: concurrency)
+  end
+
+  # Get URLs from sitemap and send found URLs to the Wayback Machine.
+  # @return [Array<String>] of URLs sent to the Wayback Machine.
+  # @param [String] url to the sitemap.
+  # @param concurrency [Integer]
+  # @example Get example.com sitemap and archive all found URLs
+  #   WaybackArchiver.sitemap('example.com/sitemap.xml') # Default concurrency is 5
+  # @example Get example.com sitemap and archive all found URLs with low concurrency
+  #   WaybackArchiver.sitemap('example.com/sitemap.xml', concurrency: 1)
+  # @see http://www.sitemaps.org
+  def self.sitemap(url, concurrency: WaybackArchiver.concurrency)
+    WaybackArchiver.logger.info "Fetching Sitemap"
+    Archive.post(URLCollector.sitemap(url), concurrency: concurrency)
+  end
+
+  # Send URL to the Wayback Machine.
+  # @return [Array<String>] of URLs sent to the Wayback Machine.
+  # @param [Array<String>/String] urls or url.
+  # @param concurrency [Integer]
+  # @example Archive example.com
+  #   WaybackArchiver.urls('example.com')
+  # @example Archive example.com and google.com
+  #   WaybackArchiver.urls(%w(example.com google.com))
+  def self.urls(urls, concurrency: WaybackArchiver.concurrency)
+    Archive.post(Array(urls), concurrency: concurrency)
   end
 
+  # Set logger
+  # @return [Object] the set logger
+  # @param [Object] logger an object that quacks like a Logger
+  # @example Set a logger that prints to standard out (STDOUT)
+  #   WaybackArchiver.logger = Logger.new(STDOUT)
   def self.logger=(logger)
     @logger = logger
   end
 
+  # Returns the current logger
+  # @return [Object] the current logger instance
   def self.logger
     @logger ||= NullLogger.new
   end
+
+  # Resets the logger to the default
+  # @return [NullLogger] a new instance of NullLogger
+  def self.default_logger!
+    @logger = NullLogger.new
+  end
+
+  # Sets the user agent
+  # @return [String] the configured user agent
+  # @param [String] user_agent the desired user agent
+  def self.user_agent=(user_agent)
+    @user_agent = user_agent
+  end
+
+  # Returns the configured user agent
+  # @return [String] the configured or the default user agent
+  def self.user_agent
+    @user_agent ||= USER_AGENT
+  end
+
+  # Sets the default concurrency
+  # @return [Integer] the desired default concurrency
+  # @param [Integer] concurrency the desired default concurrency
+  def self.concurrency=(concurrency)
+    @concurrency = concurrency
+  end
+
+  # Returns the default concurrency
+  # @return [Integer] the configured or the default concurrency
+  def self.concurrency
+    @concurrency ||= DEFAULT_CONCURRENCY
+  end
 end
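
The same strategies are available from Ruby. A minimal sketch using only the public API shown above (example.com is a placeholder; the call performs real network requests):

    require 'logger'
    require 'wayback_archiver'

    # Gem-level defaults introduced in 1.0.0
    WaybackArchiver.logger = Logger.new(STDOUT)
    WaybackArchiver.concurrency = 10

    # Tries Sitemap autodiscovery first, falls back to crawling
    WaybackArchiver.archive('example.com', strategy: :auto)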
data/lib/wayback_archiver/archive.rb CHANGED
@@ -1,46 +1,68 @@
+require 'concurrent'
+
+require 'wayback_archiver/thread_pool'
+require 'wayback_archiver/request'
+
 module WaybackArchiver
   # Post URL(s) to Wayback Machine
   class Archive
     # Wayback Machine base URL.
     WAYBACK_BASE_URL = 'https://web.archive.org/save/'.freeze
-    # Default concurrency for archiving URLs
-    DEFAULT_CONCURRENCY = 5
+
     # Send URLs to Wayback Machine.
-    # @return [Array] with sent URLs.
-    # @param [Array] urls URLs to send.
-    # @param [Hash] options
+    # @return [Array<String>] with sent URLs.
+    # @param [Array<String>] urls to send to the Wayback Machine.
+    # @param concurrency [Integer] the default is 5
     # @example Archive urls, asynchronously
     #   Archive.post(['http://example.com'])
     # @example Archive urls, using only 1 thread
     #   Archive.post(['http://example.com'], concurrency: 1)
-    def self.post(urls, concurrency: DEFAULT_CONCURRENCY)
-      WaybackArchiver.logger.info "=== WAYBACK ARCHIVER ==="
+    def self.post(urls, concurrency: WaybackArchiver.concurrency)
+      WaybackArchiver.logger.info "Total URLs to be sent: #{urls.length}"
       WaybackArchiver.logger.info "Requests are sent with up to #{concurrency} parallel threads"
-      WaybackArchiver.logger.info "Total urls to be sent: #{urls.length}"
 
-      pool = Concurrent::FixedThreadPool.new(concurrency)
+      posted_urls = Concurrent::Array.new
+      pool = ThreadPool.build(concurrency)
       urls.each do |url|
-        pool.post { Archive.post_url(url) }
+        pool.post do
+          posted_url = post_url(url)
+          posted_urls << posted_url if posted_url
+        end
       end
 
-      WaybackArchiver.logger.info "#{urls.length} URLs sent to Internet archive"
-      urls
+      pool.shutdown
+      pool.wait_for_termination
+
+      WaybackArchiver.logger.info "#{posted_urls.length} URL(s) posted to Wayback Machine"
+      posted_urls
     end
 
     # Send URLs to Wayback Machine by crawling the site.
-    # @return [Array] with URLs sent to the Wayback Machine.
+    # @return [Array<String>] with URLs sent to the Wayback Machine.
     # @param [String] source for URL to crawl.
-    # @param [Integer] concurrency (default is 5).
+    # @param concurrency [Integer] the default is 5
     # @example Crawl example.com and send all URLs of the same domain
     #   WaybackArchiver.crawl('example.com')
     # @example Crawl example.com and send all URLs of the same domain with low concurrency
     #   WaybackArchiver.crawl('example.com', concurrency: 1)
-    def self.crawl(source, concurrency: DEFAULT_CONCURRENCY)
-      pool = Concurrent::FixedThreadPool.new(concurrency) # X threads
+    def self.crawl(source, concurrency: WaybackArchiver.concurrency)
+      WaybackArchiver.logger.info "Requests are sent with up to #{concurrency} parallel threads"
+
+      posted_urls = Concurrent::Array.new
+      pool = ThreadPool.build(concurrency)
 
-      UrlCollector.crawl(source) do |url|
-        pool.post { Archive.post_url(url) }
+      found_urls = URLCollector.crawl(source) do |url|
+        pool.post do
+          posted_url = post_url(url)
+          posted_urls << posted_url if posted_url
+        end
       end
+      WaybackArchiver.logger.info "Crawling of #{source} finished, found #{found_urls.length} URL(s)"
+      pool.shutdown
+      pool.wait_for_termination
+
+      WaybackArchiver.logger.info "#{posted_urls.length} URL(s) posted to Wayback Machine"
+      posted_urls
     end
 
     # Send URL to Wayback Machine.
@@ -50,12 +72,12 @@ module WaybackArchiver
     #   Archive.post_url('http://example.com')
     def self.post_url(url)
       request_url = "#{WAYBACK_BASE_URL}#{url}"
-      response = Request.response(request_url)
-      WaybackArchiver.logger.info "[#{response.code}, #{response.message}] #{url}"
+      response = Request.get(request_url, follow_redirects: false)
+      WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}"
       url
-    rescue Exception => e
-      WaybackArchiver.logger.error "Error message: #{e.message}"
-      WaybackArchiver.logger.error "Failed to archive: #{url}"
+    rescue Request::Error => e
+      WaybackArchiver.logger.error "Failed to archive #{url}: #{e.class}, #{e.message}"
+      nil
     end
   end
 end
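
Note the behavioral change here: post_url now returns nil on Request::Error, so Archive.post and Archive.crawl return only the URLs that actually reached the Wayback Machine. A sketch of how a caller can rely on that (hypothetical URLs):

    require 'wayback_archiver'

    urls = %w[http://example.com http://example.com/about]
    posted = WaybackArchiver::Archive.post(urls, concurrency: 1)
    # Failures are logged rather than raised, so posted.length <= urls.length
    puts "#{posted.length} of #{urls.length} URL(s) posted"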
data/lib/wayback_archiver/http_code.rb ADDED
@@ -0,0 +1,49 @@
+module WaybackArchiver
+  # Convenience class for HTTP response codes
+  class HTTPCode
+    # Type of code as symbol
+    # @return [Symbol] code type
+    # @param [String/Integer] code the response code
+    # @example
+    #   HTTPCode.type('200')
+    def self.type(code)
+      code = code.to_s
+      return :success if success?(code)
+      return :redirect if redirect?(code)
+      return :error if error?(code)
+
+      :unknown
+    end
+
+    # Whether the code is a success type
+    # @return [Boolean] is success or not
+    # @param [String] code the response code
+    # @example
+    #   HTTPCode.success?('200') # => true
+    # @example
+    #   HTTPCode.success?(200) # => true
+    # @example
+    #   HTTPCode.success?(nil) # => false
+    def self.success?(code)
+      code.to_s.match?(/2\d\d/)
+    end
+
+    # Whether the code is a redirect type
+    # @return [Boolean] is redirect or not
+    # @param [String] code the response code
+    # @example
+    #   HTTPCode.redirect?('301')
+    def self.redirect?(code)
+      code.to_s.match?(/3\d\d/)
+    end
+
+    # Whether the code is an error type
+    # @return [Boolean] is error or not
+    # @param [String] code the response code
+    # @example
+    #   HTTPCode.error?('500')
+    def self.error?(code)
+      code.to_s.match?(/4\d\d/) || code.to_s.match?(/5\d\d/)
+    end
+  end
+end
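
A few illustrative calls; the return values follow directly from the regular expressions above:

    WaybackArchiver::HTTPCode.type('200')    # => :success
    WaybackArchiver::HTTPCode.type(302)      # => :redirect
    WaybackArchiver::HTTPCode.error?('503')  # => true
    WaybackArchiver::HTTPCode.success?(nil)  # => false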
data/lib/wayback_archiver/null_logger.rb CHANGED
@@ -1,9 +1,12 @@
 require 'logger'
 
-class NullLogger < Logger
-  def initialize(*args)
-  end
+module WaybackArchiver
+  # Don't log anything / Send the logs to the abyss
+  class NullLogger < Logger
+    # Allow any and all params
+    def initialize(*args); end
 
-  def add(*args, &block)
+    # Allow any and all params and don't do anything
+    def add(*args, &block); end
   end
 end
data/lib/wayback_archiver/request.rb CHANGED
@@ -1,62 +1,223 @@
-require 'url_resolver' # TODO: Allow users to use any resolver
+require 'net/http'
+require 'openssl'
+require 'timeout'
+require 'uri'
+require 'zlib'
+
+require 'wayback_archiver/http_code'
 
 module WaybackArchiver
-  # Request and parse HTML & XML documents
+  # Make HTTP requests
   class Request
-    # Get and parse HTML & XML documents.
-    # @return [Array] with links sent to the Wayback Machine.
-    # @param [String] url to retrieve and parse.
-    # @example Request and parse example.com
-    #   Request.document('example.com')
-    # @example Request and parse google.com/sitemap.xml
-    #   Request.document('google.com/sitemap.xml')
-    def self.document(url)
-      response_body = Request.response(url).body
-      Nokogiri::HTML(response_body)
+    # General error, something went wrong
+    class Error < StandardError; end
+    # Client error, something went wrong on the local machine
+    class ClientError < Error; end
+    # Server error, the remote server did something wrong
+    class ServerError < Error; end
+    # Remote server responded with a HTTP error
+    class HTTPError < ServerError; end
+    # Remote server error
+    class ResponseError < ServerError; end
+    # Max redirects reached error
+    class MaxRedirectError < ServerError; end
+    # Remote server responded with an invalid redirect
+    class InvalidRedirectError < ServerError; end
+    # Remote server responded with an unknown HTTP code
+    class UnknownResponseCodeError < ServerError; end
+
+    # GET response wrapper
+    GETStruct = Struct.new(:response, :error)
+
+    # Max number of redirects before an error is raised
+    MAX_REDIRECTS = 10
+
+    # Response data struct
+    Response = Struct.new(:code, :message, :body, :uri, :error)
+    class Response
+      # Returns true if the response is successful
+      # @example check if Response was successful
+      #   response = Response.new('200', 'OK', 'buren', 'http://example.com')
+      #   response.success? # => true
+      def success?
+        HTTPCode.success?(code)
+      end
     end
 
-    # Get response.
-    # @return [Net::HTTP*] the http response.
-    # @param [String] url URL to retrieve.
-    # @param [Boolean] resolve whether to resolve the URL.
-    # @example Resolve example.com and request
-    #   Request.response('example.com', true)
-    # @example Request http://example.com
-    #   Request.response('http://example.com', false)
-    def self.response(url, resolve = true)
-      resolved_url = resolve ? resolve_url(url) : url
-      uri = URI.parse(resolved_url)
-      http = Net::HTTP.new(uri.host, uri.port)
-      http.use_ssl = true if resolved_url.start_with?('https://')
-
-      request = Net::HTTP::Get.new(uri.request_uri)
-      request['User-Agent'] = WaybackArchiver::USER_AGENT
-      http.request(request)
-    end
-
-    # Resolve the URL, follows redirects.
-    # @return [String] the resolved URL.
-    # @param [String] url to retrieve.
-    # @example Resolve example.com and request
-    #   Request.resolve_url('example.com')
-    def self.resolve_url(url)
-      resolved = UrlResolver.resolve(url)
-      resolved = resolved.prepend('http://') unless protocol?(resolved)
-      resolved
-    end
-
-    # Check whether the URL includes a protocol.
-    # @return [Boolean] true if string includes protocol.
-    # @param [String] url to check.
-    # @example Check if string includes protocol
-    #   Request.protocol?('example.com')
-    #   # => false
-    #   Request.protocol?('https://example.com')
-    #   # => true
-    #   Request.protocol?('http://example.com')
-    #   # => true
-    def self.protocol?(url)
-      url.start_with?('http://') || url.start_with?('https://')
+    # Get response.
+    # @return [Response] the http response representation.
+    # @param [String, URI] uri to retrieve.
+    # @param max_redirects [Integer] max redirects (default: 10).
+    # @param follow_redirects [Boolean] follow redirects (default: true).
+    # @example Get example.com
+    #   Request.get('example.com')
+    # @example Get http://example.com and follow max 3 redirects
+    #   Request.get('http://example.com', max_redirects: 3)
+    # @example Get http://example.com and don't follow redirects
+    #   Request.get('http://example.com', follow_redirects: false)
+    # @raise [Error] super class of all exceptions that this method can raise
+    # @raise [ServerError] all server errors
+    # @raise [ClientError] all client errors
+    # @raise [HTTPError] all HTTP errors
+    # @raise [MaxRedirectError] too many redirects, subclass of HTTPError (only raised if raise_on_http_error flag is true)
+    # @raise [ResponseError] server responded with a 4xx or 5xx HTTP status code, subclass of HTTPError (only raised if raise_on_http_error flag is true)
+    # @raise [UnknownResponseCodeError] server responded with an unknown HTTP status code, subclass of HTTPError (only raised if raise_on_http_error flag is true)
+    # @raise [InvalidRedirectError] server responded with an invalid redirect, subclass of HTTPError (only raised if raise_on_http_error flag is true)
+    def self.get(
+      uri,
+      max_redirects: MAX_REDIRECTS,
+      raise_on_http_error: false,
+      follow_redirects: true
+    )
+      uri = build_uri(uri)
+
+      redirect_count = 0
+      until redirect_count > max_redirects
+        WaybackArchiver.logger.debug "Requesting #{uri}"
+
+        http = Net::HTTP.new(uri.host, uri.port)
+        if uri.scheme == 'https'
+          http.use_ssl = true
+          http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+        end
+
+        request = Net::HTTP::Get.new(uri.request_uri)
+        request['User-Agent'] = WaybackArchiver.user_agent
+
+        result = perform_request(uri, http, request)
+        response = result.response
+        error = result.error
+
+        raise error if error
+
+        code = response.code
+        WaybackArchiver.logger.debug "[#{code}, #{response.message}] Requested #{uri}"
+
+        case HTTPCode.type(code)
+        when :success
+          return build_response(uri, response)
+        when :redirect
+          return build_response(uri, response) unless follow_redirects
+
+          uri = build_redirect_uri(uri, response)
+          redirect_count += 1
+          next
+        when :error
+          if raise_on_http_error
+            raise ResponseError, "Failed with response code: #{code} when requesting #{uri}"
+          end
+
+          return build_response(uri, response)
+        else
+          raise UnknownResponseCodeError, "Unknown HTTP response code #{code} when requesting #{uri}"
+        end
+      end
+
+      raise MaxRedirectError, "Redirected too many times when requesting #{uri}"
+    end
+
+    # Builds a Response object.
+    # @return [Response]
+    # @param [URI] uri that was requested.
+    # @param [Net::HTTPResponse] response the server response.
+    # @example Build Response object for example.com
+    #   Request.build_response(uri, net_http_response)
+    def self.build_response(uri, response)
+      Response.new(
+        response.code,
+        response.message,
+        parse_body(response.body),
+        uri.to_s
+      )
+    end
+
+    # Builds a URI for a redirect response.
+    # @return [URI] to redirect to.
+    # @param [URI] uri that was requested.
+    # @param [Net::HTTPResponse] response the server response.
+    # @example Build redirect URI for example.com (let's pretend it will redirect)
+    #   Request.build_redirect_uri('http://example.com', net_http_response)
+    def self.build_redirect_uri(uri, response)
+      location_header = response.header.fetch('location') do
+        raise InvalidRedirectError, "No location header found on redirect when requesting #{uri}"
+      end
+
+      location = URI.parse(location_header)
+      return build_uri(uri) + location_header if location.relative?
+
+      location
+    end
+
+    # Build URI.
+    # @return [URI] uri to redirect to.
+    # @param [URI, String] uri to build.
+    # @example Build URI for example.com
+    #   Request.build_uri('http://example.com')
+    # @example Build URI for #<URI::HTTP http://example.com>
+    #   uri = URI.parse('http://example.com')
+    #   Request.build_uri(uri)
+    def self.build_uri(uri)
+      return uri if uri.is_a?(URI)
+
+      uri = "http://#{uri}" unless uri =~ %r{^https?://}
+      URI.parse(uri)
+    end
+
+    # Parse response body, handles regular and gzipped response bodies.
+    # @return [String] the response body.
+    # @param [String] response_body the server response body.
+    # @example Return response body for response.
+    #   Request.parse_body(net_http_response.body)
+    def self.parse_body(response_body)
+      return '' unless response_body
+
+      Zlib::GzipReader.new(StringIO.new(response_body)).read
+    rescue Zlib::GzipFile::Error => _e
+      response_body
+    end
+
+    # Return whether a value is blank or not.
+    # @return [Boolean] whether the value is blank or not.
+    # @param [Object] value the value to check if it's blank or not.
+    # @example Returns true for nil.
+    #   Request.blank?(nil)
+    # @example Returns true for empty string.
+    #   Request.blank?('')
+    # @example Returns true for string with only spaces.
+    #   Request.blank?(' ')
+    def self.blank?(value)
+      return true unless value
+      return true if value.strip.empty?
+
+      false
+    end
+
+    private
+
+    def self.perform_request(uri, http, request)
+      # TODO: Consider retrying failed requests
+      response = http.request(request)
+      GETStruct.new(response)
+    rescue Timeout::Error,
+           OpenSSL::SSL::SSLError,
+           Net::HTTPBadResponse,
+           Zlib::Error => e
+
+      build_request_error(uri, e, ServerError)
+    rescue SystemCallError,
+           SocketError,
+           IOError => e
+
+      build_request_error(uri, e, ClientError)
+    end
+
+    def self.build_request_error(uri, error, error_wrapper_klass)
+      WaybackArchiver.logger.error "Request to #{uri} failed: #{error_wrapper_klass}, #{error.class}, #{error.message}"
+
+      GETStruct.new(
+        Response.new,
+        error_wrapper_klass.new("#{error.class}, #{error.message}")
+      )
    end
  end
 end
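
A hedged sketch of how the new error hierarchy is meant to be consumed (hypothetical host, real network access required):

    require 'wayback_archiver'

    begin
      response = WaybackArchiver::Request.get('example.com', max_redirects: 3)
      puts "#{response.code} #{response.message} from #{response.uri}" if response.success?
    rescue WaybackArchiver::Request::ClientError => e
      warn "local failure: #{e.message}"   # e.g. DNS or socket errors
    rescue WaybackArchiver::Request::ServerError => e
      warn "remote failure: #{e.message}"  # e.g. SSL errors, unknown response codes
    end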
data/lib/wayback_archiver/sitemap.rb ADDED
@@ -0,0 +1,79 @@
+require 'rexml/document'
+
+module WaybackArchiver
+  # Parse Sitemaps, https://www.sitemaps.org
+  class Sitemap
+    attr_reader :document
+
+    def initialize(xml, strict: false)
+      @document = REXML::Document.new(xml)
+    rescue REXML::ParseException => _e
+      raise if strict
+
+      @document = REXML::Document.new('')
+    end
+
+    # Return all URLs defined in Sitemap.
+    # @return [Array<String>] of URLs defined in Sitemap.
+    # @example Get URLs defined in Sitemap
+    #   sitemap = Sitemap.new(xml)
+    #   sitemap.urls
+    def urls
+      @urls ||= extract_urls('url')
+    end
+
+    # Return all sitemap URLs defined in Sitemap.
+    # @return [Array<String>] of Sitemap URLs defined in Sitemap.
+    # @example Get Sitemap URLs defined in Sitemap
+    #   sitemap = Sitemap.new(xml)
+    #   sitemap.sitemaps
+    def sitemaps
+      @sitemaps ||= extract_urls('sitemap')
+    end
+
+    # Check if sitemap is a plain file
+    # @return [Boolean] whether document is plain
+    def plain_document?
+      document.elements.empty?
+    end
+
+    # Return the name of the document (if there is one)
+    # @return [String] the document root name
+    def root_name
+      return unless document.root
+
+      document.root.name
+    end
+
+    # Returns true if Sitemap is a Sitemap index
+    # @return [Boolean] whether the Sitemap is a Sitemap index or not
+    # @example Check if Sitemap is a sitemap index
+    #   sitemap = Sitemap.new(xml)
+    #   sitemap.sitemap_index?
+    def sitemap_index?
+      root_name == 'sitemapindex'
+    end
+
+    # Returns true if Sitemap lists regular URLs
+    # @return [Boolean] whether the Sitemap is a regular URL list
+    # @example Check if Sitemap is a regular URL list
+    #   sitemap = Sitemap.new(xml)
+    #   sitemap.urlset?
+    def urlset?
+      root_name == 'urlset'
+    end
+
+    private
+
+    # Extract URLs from Sitemap
+    def extract_urls(node_name)
+      return document.to_s.each_line.map(&:strip) if plain_document?
+
+      urls = []
+      document.root.elements.each("#{node_name}/loc") do |element|
+        urls << element.text
+      end
+      urls
+    end
+  end
+end
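
A self-contained example of the parser (inline XML, no network involved):

    require 'wayback_archiver'

    xml = <<-XML
      <urlset>
        <url><loc>http://example.com/</loc></url>
        <url><loc>http://example.com/about</loc></url>
      </urlset>
    XML

    sitemap = WaybackArchiver::Sitemap.new(xml)
    sitemap.urlset?        # => true
    sitemap.sitemap_index? # => false
    sitemap.urls           # => ["http://example.com/", "http://example.com/about"]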
data/lib/wayback_archiver/sitemapper.rb ADDED
@@ -0,0 +1,75 @@
+require 'robots'
+
+require 'wayback_archiver/sitemap'
+require 'wayback_archiver/request'
+
+module WaybackArchiver
+  # Fetch and parse sitemaps recursively
+  class Sitemapper
+    # Common locations for Sitemap(s)
+    COMMON_SITEMAP_LOCATIONS = %w[
+      sitemap_index.xml.gz
+      sitemap-index.xml.gz
+      sitemap_index.xml
+      sitemap-index.xml
+      sitemap.xml.gz
+      sitemap.xml
+    ].freeze
+
+    # Autodiscover the location of the Sitemap, then fetch and parse recursively.
+    # First it tries /robots.txt, then common locations for Sitemap and finally the supplied URL.
+    # @return [Array<String>] of URLs defined in Sitemap(s).
+    # @param [URI] url to domain.
+    # @example Get URLs defined in Sitemap for google.com
+    #   Sitemapper.autodiscover('https://google.com/')
+    # @see http://www.sitemaps.org
+    def self.autodiscover(url)
+      WaybackArchiver.logger.info 'Looking for Sitemap(s) in /robots.txt'
+      robots = Robots.new(WaybackArchiver.user_agent)
+      sitemaps = robots.other_values(url)['Sitemap']
+      if sitemaps
+        return sitemaps.flat_map do |sitemap|
+          WaybackArchiver.logger.info "Fetching Sitemap at #{sitemap}"
+          urls(url: sitemap)
+        end
+      end
+
+      COMMON_SITEMAP_LOCATIONS.each do |path|
+        WaybackArchiver.logger.info "Looking for Sitemap at #{path}"
+        sitemap_url = [url, path].join(url.end_with?('/') ? '' : '/')
+        response = Request.get(sitemap_url, raise_on_http_error: false)
+        return urls(xml: response.body) if response.success?
+      end
+
+      WaybackArchiver.logger.info "Looking for Sitemap at #{url}"
+      urls(url: url)
+    rescue Request::Error => e
+      WaybackArchiver.logger.error "Error raised when requesting #{url}, #{e.class}, #{e.message}"
+      []
+    end
+
+    # Fetch and parse sitemaps recursively.
+    # @return [Array<String>] of URLs defined in Sitemap(s).
+    # @param url [String] URL to Sitemap.
+    # @param xml [String] Sitemap XML.
+    # @example Get URLs defined in Sitemap for google.com
+    #   Sitemapper.urls(url: 'https://google.com/sitemap.xml')
+    # @example Get URLs defined in Sitemap
+    #   Sitemapper.urls(xml: xml)
+    # @see http://www.sitemaps.org
+    def self.urls(url: nil, xml: nil)
+      xml = Request.get(url).body unless xml
+      sitemap = Sitemap.new(xml)
+
+      if sitemap.sitemap_index?
+        sitemap.sitemaps.flat_map { |sitemap_url| urls(url: sitemap_url) }
+      else
+        sitemap.urls
+      end
+    rescue Request::Error => e
+      WaybackArchiver.logger.error "Error raised when requesting #{url}, #{e.class}, #{e.message}"
+
+      []
+    end
+  end
+end
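
Usage sketch for the discovery order described above (hypothetical domain; issues real HTTP requests and consults /robots.txt first):

    require 'wayback_archiver'

    urls = WaybackArchiver::Sitemapper.autodiscover('https://example.com')
    puts "found #{urls.length} URL(s)"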
data/lib/wayback_archiver/thread_pool.rb ADDED
@@ -0,0 +1,26 @@
+require 'concurrent'
+
+module WaybackArchiver
+  # Thread pool
+  class ThreadPool
+    # Build a thread pool
+    # @return [Concurrent::FixedThreadPool/Concurrent::ImmediateExecutor] an instance of a concurrent thread pool
+    # @param [Integer] concurrency the desired concurrency
+    # @example Build a thread pool with 10 as the desired concurrency
+    #   pool = ThreadPool.build(10) # Returns a Concurrent::FixedThreadPool
+    #   pool.post { some_work }
+    # @example Build a thread pool with 1 as the desired concurrency
+    #   pool = ThreadPool.build(1) # Returns a Concurrent::ImmediateExecutor
+    #   pool.post { some_work }
+    # @see https://github.com/ruby-concurrency/concurrent-ruby/blob/master/doc/thread_pools.md
+    def self.build(concurrency)
+      if concurrency == 1
+        Concurrent::ImmediateExecutor.new
+      elsif concurrency > 1
+        Concurrent::FixedThreadPool.new(concurrency)
+      else
+        raise ArgumentError, 'concurrency must be one or greater'
+      end
+    end
+  end
+end
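
The concurrency == 1 branch matters for debugging: Concurrent::ImmediateExecutor runs each block inline on the calling thread, so failures surface with readable stack traces. A small sketch:

    require 'wayback_archiver'

    pool = WaybackArchiver::ThreadPool.build(5) # Concurrent::FixedThreadPool
    10.times { |i| pool.post { puts "job #{i}" } }

    # Both executor types respond to the shutdown API used by Archive
    pool.shutdown
    pool.wait_for_termination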
data/lib/wayback_archiver/url_collector.rb CHANGED
@@ -1,51 +1,42 @@
 require 'spidr'
 require 'robots'
 
+require 'wayback_archiver/sitemapper'
+require 'wayback_archiver/request'
+
 module WaybackArchiver
   # Retrieve URLs from different sources
-  class UrlCollector
+  class URLCollector
     # Retrieve URLs from Sitemap.
-    # @return [Array] of URLs defined in Sitemap.
+    # @return [Array<String>] of URLs defined in Sitemap.
     # @param [String] url domain to retrieve Sitemap from.
     # @example Get URLs defined in Sitemap for google.com
-    #   UrlCollector.sitemap('https://google.com')
+    #   URLCollector.sitemap('https://google.com/sitemap.xml')
     def self.sitemap(url)
-      resolved = Request.resolve_url("#{url}/sitemap.xml")
-      sitemap = Request.document(resolved)
-      sitemap.css('loc').map(&:text)
+      Sitemapper.urls(url: Request.build_uri(url))
     end
 
     # Retrieve URLs by crawling.
-    # @return [Array] of URLs found during crawl.
+    # @return [Array<String>] of URLs found during crawl.
     # @param [String] url domain to crawl URLs from.
     # @example Crawl URLs defined on example.com
-    #   UrlCollector.crawl('http://example.com')
+    #   URLCollector.crawl('http://example.com')
     def self.crawl(url)
       urls = []
-      resolved_url = Request.resolve_url(url)
-      Spidr.site(resolved_url, robots: true) do |spider|
+      start_at_url = Request.build_uri(url).to_s
+      options = {
+        robots: true,
+        user_agent: WaybackArchiver.user_agent
+      }
+      Spidr.site(start_at_url, options) do |spider|
         spider.every_html_page do |page|
           page_url = page.url.to_s
           urls << page_url
-          WaybackArchiver.logger.info "Found: #{page_url}"
+          WaybackArchiver.logger.debug "Found: #{page_url}"
           yield(page_url) if block_given?
         end
       end
       urls
     end
-
-    # Retrieve URLs listed in file.
-    # @return [Array] of URLs defined in file.
-    # @param [String] path to get URLs from.
-    # @example Get URLs defined in /path/to/file
-    #   UrlCollector.file('/path/to/file')
-    def self.file(path)
-      raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
-      urls = []
-      File.open(path).read
-          .gsub(/\r\n?/, "\n")
-          .each_line { |line| urls << line.delete("\n").strip }
-      urls.reject(&:empty?)
-    end
   end
 end
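
Both collectors can also be used standalone; the block form of crawl streams URLs as they are found (hypothetical site, crawls over the network):

    require 'wayback_archiver'

    urls = WaybackArchiver::URLCollector.crawl('http://example.com') do |url|
      puts "found #{url}"
    end
    puts "#{urls.length} URL(s) in total"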
data/lib/wayback_archiver/version.rb CHANGED
@@ -1,4 +1,4 @@
 module WaybackArchiver
   # Gem version
-  VERSION = '0.2.0'.freeze
+  VERSION = '1.0.0'.freeze
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_archiver
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 1.0.0
 platform: ruby
 authors:
 - Jacob Burenstam
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-07-01 00:00:00.000000000 Z
+date: 2017-08-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: spidr
@@ -38,20 +38,6 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '0.1'
-- !ruby/object:Gem::Dependency
-  name: url_resolver
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '0.1'
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '0.1'
 - !ruby/object:Gem::Dependency
   name: concurrent-ruby
   requirement: !ruby/object:Gem::Requirement
@@ -128,14 +114,14 @@ dependencies:
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-      version: '0.7'
+      version: '0.8'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.7'
+        version: '0.8'
 - !ruby/object:Gem::Dependency
   name: redcarpet
   requirement: !ruby/object:Gem::Requirement
@@ -150,6 +136,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '3.2'
+- !ruby/object:Gem::Dependency
+  name: webmock
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
 - !ruby/object:Gem::Dependency
   name: byebug
   requirement: !ruby/object:Gem::Requirement
@@ -164,8 +164,8 @@ dependencies:
     - - ">"
       - !ruby/object:Gem::Version
        version: '0'
-description: Send URLs to Wayback Machine (Internet Archive). By crawling, sitemap,
-  file or single URL.
+description: Post URLs to Wayback Machine (Internet Archive), using a crawler, from
+  Sitemap(s) or a list of URLs.
 email:
 - burenstam@gmail.com
 executables:
@@ -176,8 +176,12 @@ files:
 - bin/wayback_archiver
 - lib/wayback_archiver.rb
 - lib/wayback_archiver/archive.rb
+- lib/wayback_archiver/http_code.rb
 - lib/wayback_archiver/null_logger.rb
 - lib/wayback_archiver/request.rb
+- lib/wayback_archiver/sitemap.rb
+- lib/wayback_archiver/sitemapper.rb
+- lib/wayback_archiver/thread_pool.rb
 - lib/wayback_archiver/url_collector.rb
 - lib/wayback_archiver/version.rb
 homepage: https://github.com/buren/wayback_archiver
@@ -203,5 +207,5 @@ rubyforge_project:
 rubygems_version: 2.6.11
 signing_key:
 specification_version: 4
-summary: Send URLs to Wayback Machine (Internet Archive)
+summary: Post URLs to Wayback Machine (Internet Archive)
 test_files: []