wayback_archiver 0.2.0 → 1.0.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: ccebbb815d374658a9b2e1b2998a40041115d295
- data.tar.gz: 153afa895756670988fa3663f44fd1fd5a2e5e3e
+ metadata.gz: 1f9f979d5fa0d31cfdf61660baa3464bdb1425e5
+ data.tar.gz: 1d5701273bbe4d02b2ba5d88f9e75c9477058a28
  SHA512:
- metadata.gz: db7d655b1ea642618797d7fa3ece8357fa608dcb60322da76be7d793de11cd61df61f796025000ed0099edd96711aa78db607ce0614e9256e5921c252a7d931b
- data.tar.gz: 919d8473c7f97bbd36c9065b95a99304ace7e0556a8dcd0f92eedf00d65961400dd92c63d86e405aa97babd5d254b522384f665ff39ababdc71afed9c13333ad
+ metadata.gz: e69883c975584b3120993371b29a0c0b7a71f3fd4210764b4caa712d4071ec175dc47fa63217950968da62485e927c05fcc977dfb076a317be4754cf4f16ec90
+ data.tar.gz: 51f80b591f40f4bc22b5bf2e2b7d56273c3c495db43d8a9268f11048a410ad355a3ff238088a4abd83c69fd585db5e1e704d34d22dac3d4f4b9b2f73089458ac
data/bin/wayback_archiver CHANGED
@@ -1,14 +1,73 @@
  #!/usr/bin/env ruby

+ require 'optparse'
  require 'wayback_archiver'

- url = ARGV[0]
- from = ARGV[1]
+ # Default values
+ urls = nil
+ strategy = 'auto'
+ log = STDOUT
+ log_level = Logger::INFO
+ concurrency = WaybackArchiver.concurrency

- WaybackArchiver.logger = Logger.new(STDOUT)
+ optparse = OptionParser.new do |parser|
+   parser.banner = 'Usage: wayback_archiver [<url>] [options]'

- if from.nil?
-   WaybackArchiver.archive(url)
- else
-   WaybackArchiver.archive(url, from)
+   parser.on('--auto', 'Auto (default)') do |value|
+     strategy = 'auto'
+   end
+
+   parser.on('--crawl', 'Crawl') do |value|
+     strategy = 'crawl'
+   end
+
+   parser.on('--sitemap', 'Sitemap') do |value|
+     strategy = 'sitemap'
+   end
+
+   parser.on('--urls', '--url', 'URL(s)') do |value|
+     strategy = 'urls'
+   end
+
+   parser.on('--concurrency=5', Integer, 'Concurrency') do |value|
+     concurrency = value
+   end
+
+   parser.on('--log=output.log', String, 'Path to desired log file (if no argument is given it defaults to STDOUT)') do |path|
+     log = path
+   end
+
+   parser.on('--[no-]verbose', 'Verbose logs') do |value|
+     log_level = value ? Logger::DEBUG : Logger::WARN
+   end
+
+   # Registered at the tail so the help text appears last in the options summary.
+   parser.on_tail('-h', '--help', 'Show this message') do
+     puts parser
+     exit
+   end
+
+   parser.on_tail('--version', 'Show version') do
+     puts "WaybackArchiver version #{WaybackArchiver::VERSION}"
+     exit
+   end
+ end
+
+ optparse.parse!
+
+ urls = ARGV.map(&:strip).reject(&:empty?)
+ if urls.empty?
+   puts optparse.help
+   raise ArgumentError, "[<url>] is required"
+ end
+
+ WaybackArchiver.logger = Logger.new(log).tap do |logger|
+   logger.progname = 'WaybackArchiver'
+   logger.level = log_level
+ end
+
+ # If no strategy has explicitly been given, then default to 'auto'
+ strategy ||= 'auto'
+ urls.each do |url|
+   WaybackArchiver.archive(url, strategy: strategy, concurrency: concurrency)
  end
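Taken together, the new executable replaces the old positional ARGV interface with an OptionParser-based CLI that accepts multiple URLs. A usage sketch (hypothetical invocations, using only the flags defined above):

    # Default --auto strategy: look for sitemap(s), fall back to crawling
    wayback_archiver example.com
    # Force crawling, 10 parallel threads, verbose logs written to a file
    wayback_archiver example.com --crawl --concurrency=10 --log=archiver.log --verbose
    # Post an explicit list of URLs
    wayback_archiver https://example.com/about https://example.com/contact --urls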
lib/wayback_archiver.rb CHANGED
@@ -1,60 +1,155 @@
- require 'uri'
- require 'net/http'
-
- require 'concurrent'
-
+ require 'wayback_archiver/thread_pool'
  require 'wayback_archiver/null_logger'
  require 'wayback_archiver/version'
  require 'wayback_archiver/url_collector'
  require 'wayback_archiver/archive'
- require 'wayback_archiver/request'
+ require 'wayback_archiver/sitemapper'

- # WaybackArchiver, send URLs to Wayback Machine. By crawling, sitemap, file or single URL.
+ # WaybackArchiver, send URLs to Wayback Machine. By crawling, sitemap or by passing a list of URLs.
  module WaybackArchiver
    # Link to gem on rubygems.org, part of the sent User-Agent
    INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'.freeze
    # WaybackArchiver User-Agent
    USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})".freeze

+   # Default concurrency for archiving URLs
+   DEFAULT_CONCURRENCY = 5
+
    # Send URLs to Wayback Machine.
-   # @return [Array] with URLs sent to the Wayback Machine.
-   # @param [String] source for URL(s).
-   # @param [String/Symbol] type of source. Supported types: ['crawl', 'sitemap', 'url', 'file'].
+   # @return [Array<String>] of URLs sent to the Wayback Machine.
+   # @param [String, Array<String>] source for URL(s).
+   # @param [String, Symbol] strategy to use. Supported strategies: crawl, sitemap, url, urls, auto.
    # @example Crawl example.com and send all URLs of the same domain
-   #   WaybackArchiver.archive('example.com') # Default type is :crawl
+   #   WaybackArchiver.archive('example.com') # Default strategy is :auto
+   #   WaybackArchiver.archive('example.com', strategy: :auto)
+   #   WaybackArchiver.archive('example.com', strategy: :auto, concurrency: 10)
+   #   WaybackArchiver.archive('example.com', :auto)
+   # @example Crawl example.com and send all URLs of the same domain
+   #   WaybackArchiver.archive('example.com', strategy: :crawl)
+   #   WaybackArchiver.archive('example.com', strategy: :crawl, concurrency: 10)
    #   WaybackArchiver.archive('example.com', :crawl)
+   # @example Send example.com Sitemap URLs
+   #   WaybackArchiver.archive('example.com', strategy: :sitemap)
+   #   WaybackArchiver.archive('example.com', strategy: :sitemap, concurrency: 10)
+   #   WaybackArchiver.archive('example.com', :sitemap)
    # @example Send only example.com
+   #   WaybackArchiver.archive('example.com', strategy: :url)
+   #   WaybackArchiver.archive('example.com', strategy: :url, concurrency: 10)
    #   WaybackArchiver.archive('example.com', :url)
-   # @example Send URL on each line in specified file
-   #   WaybackArchiver.archive('/path/to/file', :file)
-   def self.archive(source, type = :crawl)
-     case type.to_s
-     when 'file' then Archive.post(UrlCollector.file(source))
-     when 'crawl' then crawl(source)
-     when 'sitemap' then Archive.post(UrlCollector.sitemap(source))
-     when 'url' then Archive.post_url(Request.resolve_url(source))
+   def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency)
+     strategy = legacy_strategy || strategy
+
+     case strategy.to_s
+     when 'crawl' then crawl(source, concurrency: concurrency)
+     when 'auto' then auto(source, concurrency: concurrency)
+     when 'sitemap' then sitemap(source, concurrency: concurrency)
+     when 'urls' then urls(source, concurrency: concurrency)
+     when 'url' then urls(source, concurrency: concurrency)
      else
-       raise ArgumentError, "Unknown type: '#{type}'. Allowed types: sitemap, url, file, crawl"
+       raise ArgumentError, "Unknown strategy: '#{strategy}'. Allowed strategies: sitemap, urls, url, crawl"
      end
    end

+   # Look for Sitemap(s) and if nothing is found fall back to crawling.
+   # Then send found URLs to the Wayback Machine.
+   # @return [Array<String>] of URLs sent to the Wayback Machine.
+   # @param [String] source (must be a valid URL).
+   # @param concurrency [Integer]
+   # @example Auto archive example.com
+   #   WaybackArchiver.auto('example.com') # Default concurrency is 5
+   # @example Auto archive example.com with low concurrency
+   #   WaybackArchiver.auto('example.com', concurrency: 1)
+   # @see http://www.sitemaps.org
+   def self.auto(source, concurrency: WaybackArchiver.concurrency)
+     found_urls = Sitemapper.autodiscover(source)
+     return urls(found_urls, concurrency: concurrency) if found_urls.any?
+
+     crawl(source, concurrency: concurrency)
+   end
+
    # Crawl site for URLs to send to the Wayback Machine.
-   # @return [Array] with URLs sent to the Wayback Machine.
-   # @param [String] source for URL(s).
-   # @param [Integer] concurrency.
+   # @return [Array<String>] of URLs sent to the Wayback Machine.
+   # @param [String] url to start crawling from.
+   # @param concurrency [Integer]
    # @example Crawl example.com and send all URLs of the same domain
    #   WaybackArchiver.crawl('example.com') # Default concurrency is 5
    # @example Crawl example.com and send all URLs of the same domain with low concurrency
    #   WaybackArchiver.crawl('example.com', concurrency: 1)
-   def self.crawl(source, concurrency: Archive::DEFAULT_CONCURRENCY)
-     Archive.crawl(source, concurrency: concurrency)
+   def self.crawl(url, concurrency: WaybackArchiver.concurrency)
+     WaybackArchiver.logger.info "Crawling #{url}"
+     Archive.crawl(url, concurrency: concurrency)
+   end
+
+   # Get URLs from sitemap and send found URLs to the Wayback Machine.
+   # @return [Array<String>] of URLs sent to the Wayback Machine.
+   # @param [String] url to the sitemap.
+   # @param concurrency [Integer]
+   # @example Get example.com sitemap and archive all found URLs
+   #   WaybackArchiver.sitemap('example.com/sitemap.xml') # Default concurrency is 5
+   # @example Get example.com sitemap and archive all found URLs with low concurrency
+   #   WaybackArchiver.sitemap('example.com/sitemap.xml', concurrency: 1)
+   # @see http://www.sitemaps.org
+   def self.sitemap(url, concurrency: WaybackArchiver.concurrency)
+     WaybackArchiver.logger.info "Fetching Sitemap at #{url}"
+     Archive.post(URLCollector.sitemap(url), concurrency: concurrency)
+   end
+
+   # Send URL(s) to the Wayback Machine.
+   # @return [Array<String>] of URLs sent to the Wayback Machine.
+   # @param [Array<String>, String] urls to send.
+   # @param concurrency [Integer]
+   # @example Archive example.com
+   #   WaybackArchiver.urls('example.com')
+   # @example Archive example.com and google.com
+   #   WaybackArchiver.urls(%w(example.com google.com))
+   def self.urls(urls, concurrency: WaybackArchiver.concurrency)
+     Archive.post(Array(urls), concurrency: concurrency)
    end

+   # Set logger
+   # @return [Object] the set logger
+   # @param [Object] logger an object that quacks like a Logger
+   # @example Set a logger that prints to standard out (STDOUT)
+   #   WaybackArchiver.logger = Logger.new(STDOUT)
    def self.logger=(logger)
      @logger = logger
    end

+   # Returns the current logger
+   # @return [Object] the current logger instance
    def self.logger
      @logger ||= NullLogger.new
    end
+
+   # Resets the logger to the default
+   # @return [NullLogger] a new instance of NullLogger
+   def self.default_logger!
+     @logger = NullLogger.new
+   end
+
+   # Sets the user agent
+   # @return [String] the configured user agent
+   # @param [String] user_agent the desired user agent
+   def self.user_agent=(user_agent)
+     @user_agent = user_agent
+   end
+
+   # Returns the configured user agent
+   # @return [String] the configured or the default user agent
+   def self.user_agent
+     @user_agent ||= USER_AGENT
+   end
+
+   # Sets the default concurrency
+   # @return [Integer] the configured default concurrency
+   # @param [Integer] concurrency the desired default concurrency
+   def self.concurrency=(concurrency)
+     @concurrency = concurrency
+   end
+
+   # Returns the default concurrency
+   # @return [Integer] the configured or the default concurrency
+   def self.concurrency
+     @concurrency ||= DEFAULT_CONCURRENCY
+   end
  end
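The module now exposes one method per strategy plus module-level configuration. A minimal usage sketch of the new public API (domains are placeholders):

    require 'wayback_archiver'

    WaybackArchiver.logger = Logger.new(STDOUT) # optional, defaults to NullLogger
    WaybackArchiver.concurrency = 10            # optional, defaults to DEFAULT_CONCURRENCY (5)

    WaybackArchiver.archive('example.com')                    # :auto - sitemap(s), then crawl fallback
    WaybackArchiver.archive('example.com', strategy: :crawl)  # force crawling
    WaybackArchiver.sitemap('example.com/sitemap.xml')        # archive URLs from a sitemap
    WaybackArchiver.urls(%w[example.com/about example.com/contact])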
lib/wayback_archiver/archive.rb CHANGED
@@ -1,46 +1,68 @@
+ require 'concurrent'
+
+ require 'wayback_archiver/thread_pool'
+ require 'wayback_archiver/request'
+
  module WaybackArchiver
    # Post URL(s) to Wayback Machine
    class Archive
      # Wayback Machine base URL.
      WAYBACK_BASE_URL = 'https://web.archive.org/save/'.freeze
-     # Default concurrency for archiving URLs
-     DEFAULT_CONCURRENCY = 5
+
      # Send URLs to Wayback Machine.
-     # @return [Array] with sent URLs.
-     # @param [Array] urls URLs to send.
-     # @param [Hash] options
+     # @return [Array<String>] with sent URLs.
+     # @param [Array<String>] urls to send to the Wayback Machine.
+     # @param concurrency [Integer] number of parallel threads (default: 5)
      # @example Archive urls, asynchronously
      #   Archive.post(['http://example.com'])
      # @example Archive urls, using only 1 thread
      #   Archive.post(['http://example.com'], concurrency: 1)
-     def self.post(urls, concurrency: DEFAULT_CONCURRENCY)
-       WaybackArchiver.logger.info "=== WAYBACK ARCHIVER ==="
+     def self.post(urls, concurrency: WaybackArchiver.concurrency)
+       WaybackArchiver.logger.info "Total URLs to be sent: #{urls.length}"
        WaybackArchiver.logger.info "Requests are sent with up to #{concurrency} parallel threads"
-       WaybackArchiver.logger.info "Total urls to be sent: #{urls.length}"

-       pool = Concurrent::FixedThreadPool.new(concurrency)
+       posted_urls = Concurrent::Array.new
+       pool = ThreadPool.build(concurrency)
        urls.each do |url|
-         pool.post { Archive.post_url(url) }
+         pool.post do
+           posted_url = post_url(url)
+           posted_urls << posted_url if posted_url
+         end
        end

-       WaybackArchiver.logger.info "#{urls.length} URLs sent to Internet archive"
-       urls
+       pool.shutdown
+       pool.wait_for_termination
+
+       WaybackArchiver.logger.info "#{posted_urls.length} URL(s) posted to Wayback Machine"
+       posted_urls
      end

      # Send URLs to Wayback Machine by crawling the site.
-     # @return [Array] with URLs sent to the Wayback Machine.
+     # @return [Array<String>] with URLs sent to the Wayback Machine.
      # @param [String] source for URL to crawl.
-     # @param [Integer] concurrency (default is 5).
+     # @param concurrency [Integer] number of parallel threads (default: 5)
      # @example Crawl example.com and send all URLs of the same domain
      #   WaybackArchiver.crawl('example.com')
      # @example Crawl example.com and send all URLs of the same domain with low concurrency
      #   WaybackArchiver.crawl('example.com', concurrency: 1)
-     def self.crawl(source, concurrency: DEFAULT_CONCURRENCY)
-       pool = Concurrent::FixedThreadPool.new(concurrency) # X threads
+     def self.crawl(source, concurrency: WaybackArchiver.concurrency)
+       WaybackArchiver.logger.info "Requests are sent with up to #{concurrency} parallel threads"
+
+       posted_urls = Concurrent::Array.new
+       pool = ThreadPool.build(concurrency)

-       UrlCollector.crawl(source) do |url|
-         pool.post { Archive.post_url(url) }
+       found_urls = URLCollector.crawl(source) do |url|
+         pool.post do
+           posted_url = post_url(url)
+           posted_urls << posted_url if posted_url
+         end
        end
+       WaybackArchiver.logger.info "Crawling of #{source} finished, found #{found_urls.length} URL(s)"
+       pool.shutdown
+       pool.wait_for_termination
+
+       WaybackArchiver.logger.info "#{posted_urls.length} URL(s) posted to Wayback Machine"
+       posted_urls
      end

      # Send URL to Wayback Machine.
@@ -50,12 +72,12 @@ module WaybackArchiver
      #   Archive.post_url('http://example.com')
      def self.post_url(url)
        request_url = "#{WAYBACK_BASE_URL}#{url}"
-       response = Request.response(request_url)
-       WaybackArchiver.logger.info "[#{response.code}, #{response.message}] #{url}"
+       response = Request.get(request_url, follow_redirects: false)
+       WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}"
        url
-     rescue Exception => e
-       WaybackArchiver.logger.error "Error message: #{e.message}"
-       WaybackArchiver.logger.error "Failed to archive: #{url}"
+     rescue Request::Error => e
+       WaybackArchiver.logger.error "Failed to archive #{url}: #{e.class}, #{e.message}"
+       nil
      end
    end
  end
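Note the changed return value: post and crawl now block until the thread pool drains and return only the URLs that were actually posted (a failed post_url logs the error and returns nil, so failures are filtered out). A sketch:

    posted = WaybackArchiver::Archive.post(
      %w[http://example.com http://example.com/about],
      concurrency: 2
    )
    posted # => the subset of the input that was successfully posted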
lib/wayback_archiver/http_code.rb ADDED
@@ -0,0 +1,49 @@
+ module WaybackArchiver
+   # Convenience class for HTTP response codes
+   class HTTPCode
+     # Type of code as symbol
+     # @return [Symbol] code type
+     # @param [String, Integer] code the response code
+     # @example
+     #   HTTPCode.type('200')
+     def self.type(code)
+       code = code.to_s
+       return :success if success?(code)
+       return :redirect if redirect?(code)
+       return :error if error?(code)
+
+       :unknown
+     end
+
+     # Whether the code is a success type
+     # @return [Boolean] is success or not
+     # @param [String] code the response code
+     # @example
+     #   HTTPCode.success?('200') # => true
+     # @example
+     #   HTTPCode.success?(200) # => true
+     # @example
+     #   HTTPCode.success?(nil) # => false
+     def self.success?(code)
+       code.to_s.match?(/2\d\d/)
+     end
+
+     # Whether the code is a redirect type
+     # @return [Boolean] is redirect or not
+     # @param [String] code the response code
+     # @example
+     #   HTTPCode.redirect?('301')
+     def self.redirect?(code)
+       code.to_s.match?(/3\d\d/)
+     end
+
+     # Whether the code is an error type
+     # @return [Boolean] is error or not
+     # @param [String] code the response code
+     # @example
+     #   HTTPCode.error?('500')
+     def self.error?(code)
+       code.to_s.match?(/4\d\d/) || code.to_s.match?(/5\d\d/)
+     end
+   end
+ end
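A quick sketch of how the new helper classifies Net::HTTP status codes:

    WaybackArchiver::HTTPCode.type('200') # => :success
    WaybackArchiver::HTTPCode.type(302)   # => :redirect
    WaybackArchiver::HTTPCode.type('503') # => :error
    WaybackArchiver::HTTPCode.type('foo') # => :unknown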
lib/wayback_archiver/null_logger.rb CHANGED
@@ -1,9 +1,12 @@
  require 'logger'

- class NullLogger < Logger
-   def initialize(*args)
-   end
+ module WaybackArchiver
+   # Don't log anything / Send the logs to the abyss
+   class NullLogger < Logger
+     # Allow any and all params
+     def initialize(*args); end

-   def add(*args, &block)
+     # Allow any and all params and don't do anything
+     def add(*args, &block); end
    end
  end
lib/wayback_archiver/request.rb CHANGED
@@ -1,62 +1,224 @@
- require 'url_resolver' # TODO: Allow users to use any resolver
+ require 'net/http'
+ require 'openssl'
+ require 'stringio'
+ require 'timeout'
+ require 'uri'
+ require 'zlib'
+
+ require 'wayback_archiver/http_code'

  module WaybackArchiver
-   # Request and parse HTML & XML documents
+   # Make HTTP requests
    class Request
-     # Get and parse HTML & XML documents.
-     # @return [Array] with links sent to the Wayback Machine.
-     # @param [String] url to retrieve and parse.
-     # @example Request and parse example.com
-     #   Request.document('example.com')
-     # @example Request and parse google.com/sitemap.xml
-     #   Request.document('google.com/sitemap.xml')
-     def self.document(url)
-       response_body = Request.response(url).body
-       Nokogiri::HTML(response_body)
+     # General error, something went wrong
+     class Error < StandardError; end
+     # Client error, something went wrong on the local machine
+     class ClientError < Error; end
+     # Server error, the remote server did something wrong
+     class ServerError < Error; end
+     # Remote server responded with an HTTP error
+     class HTTPError < ServerError; end
+     # Remote server error
+     class ResponseError < ServerError; end
+     # Max redirects reached error
+     class MaxRedirectError < ServerError; end
+     # Remote server responded with an invalid redirect
+     class InvalidRedirectError < ServerError; end
+     # Remote server responded with an unknown HTTP code
+     class UnknownResponseCodeError < ServerError; end
+
+     # GET response wrapper
+     GETStruct = Struct.new(:response, :error)
+
+     # Max number of redirects before an error is raised
+     MAX_REDIRECTS = 10
+
+     # Response data struct
+     Response = Struct.new(:code, :message, :body, :uri, :error)
+     class Response
+       # Returns true if a successful response
+       # @example Check if Response was successful
+       #   response = Response.new('200', 'OK', 'buren', 'http://example.com')
+       #   response.success? # => true
+       def success?
+         HTTPCode.success?(code)
+       end
      end

      # Get response.
-     # @return [Net::HTTP*] the http response.
-     # @param [String] url URL to retrieve.
-     # @param [Boolean] resolve whether to resolve the URL.
-     # @example Resolve example.com and request
-     #   Request.response('example.com', true)
-     # @example Request http://example.com
-     #   Request.response('http://example.com', false)
-     def self.response(url, resolve = true)
-       resolved_url = resolve ? resolve_url(url) : url
-       uri = URI.parse(resolved_url)
-       http = Net::HTTP.new(uri.host, uri.port)
-       http.use_ssl = true if resolved_url.start_with?('https://')
-
-       request = Net::HTTP::Get.new(uri.request_uri)
-       request['User-Agent'] = WaybackArchiver::USER_AGENT
-       http.request(request)
-     end
-
-     # Resolve the URL, follows redirects.
-     # @return [String] the resolved URL.
-     # @param [String] url to retrieve.
-     # @example Resolve example.com and request
-     #   Request.resolve_url('example.com')
-     def self.resolve_url(url)
-       resolved = UrlResolver.resolve(url)
-       resolved = resolved.prepend('http://') unless protocol?(resolved)
-       resolved
-     end
-
-     # Resolve the URL, follows redirects.
-     # @return [Boolean] true if string includes protocol.
-     # @param [String] url to check.
-     # @example Check if string includes protocol
-     #   Request.protocol?('example.com')
-     #   # => false
-     #   Request.protocol?('https://example.com')
-     #   # => true
-     #   Request.protocol?('http://example.com')
-     #   # => true
-     def self.protocol?(url)
-       url.start_with?('http://') || url.start_with?('https://')
+     # @return [Response] the http response representation.
+     # @param [String, URI] uri to retrieve.
+     # @param max_redirects [Integer] max redirects (default: 10).
+     # @param follow_redirects [Boolean] follow redirects (default: true).
+     # @example Get example.com
+     #   Request.get('example.com')
+     # @example Get http://example.com and follow max 3 redirects
+     #   Request.get('http://example.com', max_redirects: 3)
+     # @example Get http://example.com and don't follow redirects
+     #   Request.get('http://example.com', follow_redirects: false)
+     # @raise [Error] super class of all exceptions that this method can raise
+     # @raise [ServerError] all server errors
+     # @raise [ClientError] all client errors
+     # @raise [HTTPError] all HTTP errors
+     # @raise [MaxRedirectError] too many redirects, subclass of ServerError
+     # @raise [ResponseError] server responded with a 4xx or 5xx HTTP status code, subclass of ServerError (only raised if the raise_on_http_error flag is true)
+     # @raise [UnknownResponseCodeError] server responded with an unknown HTTP status code, subclass of ServerError
+     # @raise [InvalidRedirectError] server responded with an invalid redirect, subclass of ServerError (only raised if follow_redirects is true)
+     def self.get(
+       uri,
+       max_redirects: MAX_REDIRECTS,
+       raise_on_http_error: false,
+       follow_redirects: true
+     )
+       uri = build_uri(uri)
+
+       redirect_count = 0
+       until redirect_count > max_redirects
+         WaybackArchiver.logger.debug "Requesting #{uri}"
+
+         http = Net::HTTP.new(uri.host, uri.port)
+         if uri.scheme == 'https'
+           http.use_ssl = true
+           http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+         end
+
+         request = Net::HTTP::Get.new(uri.request_uri)
+         request['User-Agent'] = WaybackArchiver.user_agent
+
+         result = perform_request(uri, http, request)
+         response = result.response
+         error = result.error
+
+         raise error if error
+
+         code = response.code
+         WaybackArchiver.logger.debug "[#{code}, #{response.message}] Requested #{uri}"
+
+         case HTTPCode.type(code)
+         when :success
+           return build_response(uri, response)
+         when :redirect
+           return build_response(uri, response) unless follow_redirects
+
+           uri = build_redirect_uri(uri, response)
+           redirect_count += 1
+           next
+         when :error
+           if raise_on_http_error
+             raise ResponseError, "Failed with response code: #{code} when requesting #{uri}"
+           end
+
+           return build_response(uri, response)
+         else
+           raise UnknownResponseCodeError, "Unknown HTTP response code #{code} when requesting #{uri}"
+         end
+       end
+
+       raise MaxRedirectError, "Redirected too many times when requesting #{uri}"
+     end
+
+     # Builds a Response object.
+     # @return [Response]
+     # @param [URI] uri that was requested.
+     # @param [Net::HTTPResponse] response the server response.
+     # @example Build Response object for example.com
+     #   Request.build_response(uri, net_http_response)
+     def self.build_response(uri, response)
+       Response.new(
+         response.code,
+         response.message,
+         parse_body(response.body),
+         uri.to_s
+       )
+     end
+
+     # Builds an URI for a redirect response.
+     # @return [URI] to redirect to.
+     # @param [URI] uri that was requested.
+     # @param [Net::HTTPResponse] response the server response.
+     # @example Build redirect URI for example.com (let's pretend it will redirect)
+     #   Request.build_redirect_uri('http://example.com', net_http_response)
+     def self.build_redirect_uri(uri, response)
+       location_header = response.header.fetch('location') do
+         raise InvalidRedirectError, "No location header found on redirect when requesting #{uri}"
+       end
+
+       location = URI.parse(location_header)
+       return build_uri(uri) + location_header if location.relative?
+
+       location
+     end
+
+     # Build URI.
+     # @return [URI] uri to redirect to.
+     # @param [URI, String] uri to build.
+     # @example Build URI for example.com
+     #   Request.build_uri('http://example.com')
+     # @example Build URI for #<URI::HTTP http://example.com>
+     #   uri = URI.parse('http://example.com')
+     #   Request.build_uri(uri)
+     def self.build_uri(uri)
+       return uri if uri.is_a?(URI)
+
+       uri = "http://#{uri}" unless uri =~ %r{^https?://}
+       URI.parse(uri)
+     end
+
+     # Parse response body, handles regular and gzipped response bodies.
+     # @return [String] the response body.
+     # @param [String] response_body the server response body.
+     # @example Return response body for response.
+     #   Request.parse_body(net_http_response.body)
+     def self.parse_body(response_body)
+       return '' unless response_body
+
+       Zlib::GzipReader.new(StringIO.new(response_body)).read
+     rescue Zlib::GzipFile::Error => _e
+       response_body
+     end
+
+     # Return whether a value is blank or not.
+     # @return [Boolean] whether the value is blank or not.
+     # @param [Object] value the value to check if its blank or not.
+     # @example Returns true for nil.
+     #   Request.blank?(nil)
+     # @example Returns true for empty string.
+     #   Request.blank?('')
+     # @example Returns true for a string with only spaces.
+     #   Request.blank?(' ')
+     def self.blank?(value)
+       return true unless value
+       return true if value.strip.empty?
+
+       false
+     end
+
+     private
+
+     def self.perform_request(uri, http, request)
+       # TODO: Consider retrying failed requests
+       response = http.request(request)
+       GETStruct.new(response)
+     rescue Timeout::Error,
+            OpenSSL::SSL::SSLError,
+            Net::HTTPBadResponse,
+            Zlib::Error => e
+
+       build_request_error(uri, e, ServerError)
+     rescue SystemCallError,
+            SocketError,
+            IOError => e
+
+       build_request_error(uri, e, ClientError)
+     end
+
+     def self.build_request_error(uri, error, error_wrapper_klass)
+       WaybackArchiver.logger.error "Request to #{uri} failed: #{error_wrapper_klass}, #{error.class}, #{error.message}"
+
+       GETStruct.new(
+         Response.new,
+         error_wrapper_klass.new("#{error.class}, #{error.message}")
+       )
      end
    end
  end
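The new Request.get wraps Net::HTTP with redirect following, gzip handling and a typed error hierarchy rooted at Request::Error. A calling sketch (URLs are placeholders):

    response = WaybackArchiver::Request.get('example.com', max_redirects: 3)
    response.success? # => true if the final status was 2xx
    response.code     # => e.g. '200'

    begin
      WaybackArchiver::Request.get('example.com', raise_on_http_error: true)
    rescue WaybackArchiver::Request::Error => e
      # ClientError wraps local failures (socket/IO), ServerError wraps remote ones
    end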
lib/wayback_archiver/sitemap.rb ADDED
@@ -0,0 +1,79 @@
+ require 'rexml/document'
+
+ module WaybackArchiver
+   # Parse Sitemaps, https://www.sitemaps.org
+   class Sitemap
+     attr_reader :document
+
+     def initialize(xml, strict: false)
+       @document = REXML::Document.new(xml)
+     rescue REXML::ParseException => _e
+       raise if strict
+
+       @document = REXML::Document.new('')
+     end
+
+     # Return all URLs defined in Sitemap.
+     # @return [Array<String>] of URLs defined in Sitemap.
+     # @example Get URLs defined in Sitemap
+     #   sitemap = Sitemap.new(xml)
+     #   sitemap.urls
+     def urls
+       @urls ||= extract_urls('url')
+     end
+
+     # Return all sitemap URLs defined in Sitemap.
+     # @return [Array<String>] of Sitemap URLs defined in Sitemap.
+     # @example Get Sitemap URLs defined in Sitemap
+     #   sitemap = Sitemap.new(xml)
+     #   sitemap.sitemaps
+     def sitemaps
+       @sitemaps ||= extract_urls('sitemap')
+     end
+
+     # Check if sitemap is a plain file
+     # @return [Boolean] whether document is plain
+     def plain_document?
+       document.elements.empty?
+     end
+
+     # Return the name of the document root (if there is one)
+     # @return [String] the document root name
+     def root_name
+       return unless document.root
+
+       document.root.name
+     end
+
+     # Returns true if Sitemap is a Sitemap index
+     # @return [Boolean] whether the Sitemap is a Sitemap index or not
+     # @example Check if Sitemap is a sitemap index
+     #   sitemap = Sitemap.new(xml)
+     #   sitemap.sitemap_index?
+     def sitemap_index?
+       root_name == 'sitemapindex'
+     end
+
+     # Returns true if Sitemap lists regular URLs
+     # @return [Boolean] whether the Sitemap is a regular URL list
+     # @example Check if Sitemap is a regular URL list
+     #   sitemap = Sitemap.new(xml)
+     #   sitemap.urlset?
+     def urlset?
+       root_name == 'urlset'
+     end
+
+     private
+
+     # Extract URLs from Sitemap
+     def extract_urls(node_name)
+       return document.to_s.each_line.map(&:strip) if plain_document?
+
+       urls = []
+       document.root.elements.each("#{node_name}/loc") do |element|
+         urls << element.text
+       end
+       urls
+     end
+   end
+ end
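A sketch of parsing a sitemap with the new class (inline XML for illustration; the sitemap namespace declaration is omitted for brevity):

    xml = <<~XML
      <urlset>
        <url><loc>http://example.com/</loc></url>
        <url><loc>http://example.com/about</loc></url>
      </urlset>
    XML

    sitemap = WaybackArchiver::Sitemap.new(xml)
    sitemap.urlset?        # => true
    sitemap.sitemap_index? # => false
    sitemap.urls           # => ["http://example.com/", "http://example.com/about"]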
lib/wayback_archiver/sitemapper.rb ADDED
@@ -0,0 +1,75 @@
+ require 'robots'
+
+ require 'wayback_archiver/sitemap'
+ require 'wayback_archiver/request'
+
+ module WaybackArchiver
+   # Fetch and parse sitemaps recursively
+   class Sitemapper
+     # Common locations for Sitemap(s)
+     COMMON_SITEMAP_LOCATIONS = %w[
+       sitemap_index.xml.gz
+       sitemap-index.xml.gz
+       sitemap_index.xml
+       sitemap-index.xml
+       sitemap.xml.gz
+       sitemap.xml
+     ].freeze
+
+     # Autodiscover the location of the Sitemap, then fetch and parse recursively.
+     # First it tries /robots.txt, then common locations for Sitemap and finally the supplied URL.
+     # @return [Array<String>] of URLs defined in Sitemap(s).
+     # @param [String] url to domain.
+     # @example Get URLs defined in Sitemap for google.com
+     #   Sitemapper.autodiscover('https://google.com/')
+     # @see http://www.sitemaps.org
+     def self.autodiscover(url)
+       WaybackArchiver.logger.info 'Looking for Sitemap(s) in /robots.txt'
+       robots = Robots.new(WaybackArchiver.user_agent)
+       sitemaps = robots.other_values(url)['Sitemap']
+       if sitemaps
+         return sitemaps.flat_map do |sitemap|
+           WaybackArchiver.logger.info "Fetching Sitemap at #{sitemap}"
+           urls(url: sitemap)
+         end
+       end
+
+       COMMON_SITEMAP_LOCATIONS.each do |path|
+         WaybackArchiver.logger.info "Looking for Sitemap at #{path}"
+         sitemap_url = [url, path].join(url.end_with?('/') ? '' : '/')
+         response = Request.get(sitemap_url, raise_on_http_error: false)
+         return urls(xml: response.body) if response.success?
+       end
+
+       WaybackArchiver.logger.info "Looking for Sitemap at #{url}"
+       urls(url: url)
+     rescue Request::Error => e
+       WaybackArchiver.logger.error "Error raised when requesting #{url}, #{e.class}, #{e.message}"
+       []
+     end
+
+     # Fetch and parse sitemaps recursively.
+     # @return [Array<String>] of URLs defined in Sitemap(s).
+     # @param url [String] URL to Sitemap.
+     # @param xml [String] Sitemap XML.
+     # @example Get URLs defined in Sitemap for google.com
+     #   Sitemapper.urls(url: 'https://google.com/sitemap.xml')
+     # @example Get URLs defined in Sitemap
+     #   Sitemapper.urls(xml: xml)
+     # @see http://www.sitemaps.org
+     def self.urls(url: nil, xml: nil)
+       xml = Request.get(url).body unless xml
+       sitemap = Sitemap.new(xml)
+
+       if sitemap.sitemap_index?
+         sitemap.sitemaps.flat_map { |sitemap_url| urls(url: sitemap_url) }
+       else
+         sitemap.urls
+       end
+     rescue Request::Error => e
+       WaybackArchiver.logger.error "Error raised when requesting #{url}, #{e.class}, #{e.message}"
+
+       []
+     end
+   end
+ end
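The autodiscovery order is: robots.txt Sitemap entries, then the common locations above, then the supplied URL itself. A sketch (domain is a placeholder):

    urls = WaybackArchiver::Sitemapper.autodiscover('https://example.com')
    # 1. Sitemap: entries in https://example.com/robots.txt
    # 2. common locations, /sitemap_index.xml.gz down to /sitemap.xml
    # 3. https://example.com itself, parsed as a sitemap
    urls # => all found <loc> URLs, following sitemap indexes recursively; [] on request errors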
lib/wayback_archiver/thread_pool.rb ADDED
@@ -0,0 +1,26 @@
+ require 'concurrent'
+
+ module WaybackArchiver
+   # Thread pool
+   class ThreadPool
+     # Build a thread pool
+     # @return [Concurrent::FixedThreadPool, Concurrent::ImmediateExecutor] an instance of a concurrent thread pool
+     # @param [Integer] concurrency the desired concurrency
+     # @example Build a thread pool with 10 as the desired concurrency
+     #   pool = ThreadPool.build(10) # => Concurrent::FixedThreadPool
+     #   pool.post { some_work }
+     # @example Build a thread pool with 1 as the desired concurrency
+     #   pool = ThreadPool.build(1) # => Concurrent::ImmediateExecutor
+     #   pool.post { some_work }
+     # @see https://github.com/ruby-concurrency/concurrent-ruby/blob/master/doc/thread_pools.md
+     def self.build(concurrency)
+       if concurrency == 1
+         Concurrent::ImmediateExecutor.new
+       elsif concurrency > 1
+         Concurrent::FixedThreadPool.new(concurrency)
+       else
+         raise ArgumentError, 'concurrency must be one or greater'
+       end
+     end
+   end
+ end
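Returning a Concurrent::ImmediateExecutor for concurrency 1 makes single-threaded runs execute inline and deterministically while keeping the same executor interface as the pooled case. A sketch:

    pool = WaybackArchiver::ThreadPool.build(5) # => Concurrent::FixedThreadPool
    pool.post { puts 'archiving...' }
    pool.shutdown
    pool.wait_for_termination

    WaybackArchiver::ThreadPool.build(1) # => Concurrent::ImmediateExecutor (runs blocks inline)
    WaybackArchiver::ThreadPool.build(0) # => raises ArgumentError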
lib/wayback_archiver/url_collector.rb CHANGED
@@ -1,51 +1,42 @@
  require 'spidr'
  require 'robots'

+ require 'wayback_archiver/sitemapper'
+ require 'wayback_archiver/request'
+
  module WaybackArchiver
    # Retrieve URLs from different sources
-   class UrlCollector
+   class URLCollector
      # Retrieve URLs from Sitemap.
-     # @return [Array] of URLs defined in Sitemap.
-     # @param [String] url domain to retrieve Sitemap from.
+     # @return [Array<String>] of URLs defined in Sitemap.
+     # @param [String] url to the Sitemap.
      # @example Get URLs defined in Sitemap for google.com
-     #   UrlCollector.sitemap('https://google.com')
+     #   URLCollector.sitemap('https://google.com/sitemap.xml')
      def self.sitemap(url)
-       resolved = Request.resolve_url("#{url}/sitemap.xml")
-       sitemap = Request.document(resolved)
-       sitemap.css('loc').map(&:text)
+       Sitemapper.urls(url: Request.build_uri(url))
      end

      # Retrieve URLs by crawling.
-     # @return [Array] of URLs defined found during crawl.
+     # @return [Array<String>] of URLs found during crawl.
      # @param [String] url domain to crawl URLs from.
      # @example Crawl URLs defined on example.com
-     #   UrlCollector.crawl('http://example.com')
+     #   URLCollector.crawl('http://example.com')
      def self.crawl(url)
        urls = []
-       resolved_url = Request.resolve_url(url)
-       Spidr.site(resolved_url, robots: true) do |spider|
+       start_at_url = Request.build_uri(url).to_s
+       options = {
+         robots: true,
+         user_agent: WaybackArchiver.user_agent
+       }
+       Spidr.site(start_at_url, options) do |spider|
          spider.every_html_page do |page|
            page_url = page.url.to_s
            urls << page_url
-           WaybackArchiver.logger.info "Found: #{page_url}"
+           WaybackArchiver.logger.debug "Found: #{page_url}"
            yield(page_url) if block_given?
          end
        end
        urls
      end
-
-     # Retrieve URLs listed in file.
-     # @return [Array] of URLs defined in file.
-     # @param [String] path to get URLs from.
-     # @example Get URLs defined in /path/to/file
-     #   UrlCollector.file('/path/to/file')
-     def self.file(path)
-       raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
-       urls = []
-       File.open(path).read
-         .gsub(/\r\n?/, "\n")
-         .each_line { |line| urls << line.delete("\n").strip }
-       urls.reject(&:empty?)
-     end
    end
  end
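The collector is renamed from UrlCollector to URLCollector, sitemap parsing is delegated to Sitemapper, and the file source is dropped (superseded by WaybackArchiver.urls). A sketch of the remaining collectors:

    WaybackArchiver::URLCollector.sitemap('https://example.com/sitemap.xml')
    WaybackArchiver::URLCollector.crawl('https://example.com') do |url|
      puts url # yielded for every HTML page as it is found
    end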
lib/wayback_archiver/version.rb CHANGED
@@ -1,4 +1,4 @@
  module WaybackArchiver
    # Gem version
-   VERSION = '0.2.0'.freeze
+   VERSION = '1.0.0'.freeze
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: wayback_archiver
  version: !ruby/object:Gem::Version
-   version: 0.2.0
+   version: 1.0.0
  platform: ruby
  authors:
  - Jacob Burenstam
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2017-07-01 00:00:00.000000000 Z
+ date: 2017-08-01 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: spidr
@@ -38,20 +38,6 @@ dependencies:
      - - "~>"
        - !ruby/object:Gem::Version
          version: '0.1'
- - !ruby/object:Gem::Dependency
-   name: url_resolver
-   requirement: !ruby/object:Gem::Requirement
-     requirements:
-     - - "~>"
-       - !ruby/object:Gem::Version
-         version: '0.1'
-   type: :runtime
-   prerelease: false
-   version_requirements: !ruby/object:Gem::Requirement
-     requirements:
-     - - "~>"
-       - !ruby/object:Gem::Version
-         version: '0.1'
  - !ruby/object:Gem::Dependency
    name: concurrent-ruby
    requirement: !ruby/object:Gem::Requirement
@@ -128,14 +114,14 @@ dependencies:
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: '0.7'
+         version: '0.8'
    type: :development
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: '0.7'
+         version: '0.8'
  - !ruby/object:Gem::Dependency
    name: redcarpet
    requirement: !ruby/object:Gem::Requirement
@@ -150,6 +136,20 @@ dependencies:
      - - "~>"
        - !ruby/object:Gem::Version
          version: '3.2'
+ - !ruby/object:Gem::Dependency
+   name: webmock
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.0'
  - !ruby/object:Gem::Dependency
    name: byebug
    requirement: !ruby/object:Gem::Requirement
@@ -164,8 +164,8 @@ dependencies:
      - - ">"
        - !ruby/object:Gem::Version
          version: '0'
- description: Send URLs to Wayback Machine (Internet Archive). By crawling, sitemap,
-   file or single URL.
+ description: Post URLs to Wayback Machine (Internet Archive), using a crawler, from
+   Sitemap(s) or a list of URLs.
  email:
  - burenstam@gmail.com
  executables:
@@ -176,8 +176,12 @@ files:
  - bin/wayback_archiver
  - lib/wayback_archiver.rb
  - lib/wayback_archiver/archive.rb
+ - lib/wayback_archiver/http_code.rb
  - lib/wayback_archiver/null_logger.rb
  - lib/wayback_archiver/request.rb
+ - lib/wayback_archiver/sitemap.rb
+ - lib/wayback_archiver/sitemapper.rb
+ - lib/wayback_archiver/thread_pool.rb
  - lib/wayback_archiver/url_collector.rb
  - lib/wayback_archiver/version.rb
  homepage: https://github.com/buren/wayback_archiver
@@ -203,5 +207,5 @@ rubyforge_project:
  rubygems_version: 2.6.11
  signing_key:
  specification_version: 4
- summary: Send URLs to Wayback Machine (Internet Archive)
+ summary: Post URLs to Wayback Machine (Internet Archive)
  test_files: []