wayback_archiver 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 4dba94d5ac29b57a1df0a6e98f5d9ea29c1905b9ffdef7ee9bf9c344448362fe
-  data.tar.gz: 79af5b2af8660dd5f0cafae521e7b73be36e528da7c8ce2f62502f76b434959b
+  metadata.gz: e5f39f42fe6d5a4f6fbded5ab460cd91a2c9411af28645a41b987bf01a31ea14
+  data.tar.gz: 8cc1f5dbdc7d55fb9a1ec358c354fd8db3bacd856a824e9090005445a966de39
 SHA512:
-  metadata.gz: d02a18809ebdc880bdcdb9a7d0d522da445c2a2329959080c897d1f7033f6a687c268a626abfa05a45791c5e1404a4e3e8025ea172494bca0b430033e4fc20b0
-  data.tar.gz: 316e8fe4e922b0aeb5047191fa6ddd8e242fa7bfd1ce8eb2b2a2e08d980999dd895c071f82f949a2690c4259bc5b3de00049e691b540040db79d002ffdf8a4f0
+  metadata.gz: 6e3edf351b7cda562d39120df2dff564a8248c41ecedae54deed2017c11cbc6c56311507a7dda87ce350b17ac9bdf3d265523dde3a79de751426d206a2f51741
+  data.tar.gz: 020cb49d6dfc204de93a035853a26448b20901dc962d772554c4cb9074ff14682c570afffe3c2cbc05984c2298a7a05ffe662f8aaccde80e510b97450dd270cc
data/bin/wayback_archiver CHANGED
@@ -10,6 +10,7 @@ log = STDOUT
 log_level = Logger::INFO
 concurrency = WaybackArchiver.concurrency
 limit = WaybackArchiver.max_limit
+hosts = []
 
 optparse = OptionParser.new do |parser|
   parser.banner = 'Usage: wayback_archiver [<url>] [options]'
@@ -30,7 +31,11 @@ optparse = OptionParser.new do |parser|
     strategy = 'urls'
   end
 
-  parser.on('--concurrency=5', Integer, 'Concurrency') do |value|
+  parser.on('--hosts=[example.com]', Array, 'Only spider links on certain hosts') do |value|
+    hosts = value.map { |v| Regexp.new(v) } if value
+  end
+
+  parser.on('--concurrency=1', Integer, 'Concurrency') do |value|
     concurrency = value
   end
 
@@ -81,6 +86,7 @@ strategy ||= 'auto'
 urls.each do |url|
   WaybackArchiver.archive(
     url,
+    hosts: hosts,
     strategy: strategy,
     concurrency: concurrency,
     limit: limit
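
The new --hosts flag maps each comma-separated value to a Regexp and hands the list to WaybackArchiver.archive via the hosts: keyword. A minimal sketch of the equivalent library call, assuming the gem is installed and example.com stands in for a real site:

require 'wayback_archiver'

# Crawl and archive example.com, also following links on a second host.
# Per the updated docs in this release, hosts accepts plain strings and Regexps.
WaybackArchiver.archive(
  'https://example.com',
  strategy: :crawl,
  hosts: ['example.com', /blog\.example\.com/],
  concurrency: 1 # the new default, lowered from 5 in this release
)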
data/lib/wayback_archiver.rb CHANGED
@@ -11,9 +11,11 @@ module WaybackArchiver
   INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'.freeze
   # WaybackArchiver User-Agent
   USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})".freeze
+  # Default for whether to respect robots txt files
+  DEFAULT_RESPECT_ROBOTS_TXT = false
 
   # Default concurrency for archiving URLs
-  DEFAULT_CONCURRENCY = 5
+  DEFAULT_CONCURRENCY = 1
 
   # Maxmium number of links posted (-1 is no limit)
   DEFAULT_MAX_LIMIT = -1
@@ -22,6 +24,7 @@ module WaybackArchiver
   # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
   # @param [String/Array<String>] source for URL(s).
   # @param [String/Symbol] strategy of source. Supported strategies: crawl, sitemap, url, urls, auto.
+  # @param [Array<String, Regexp>] hosts to crawl.
   # @example Crawl example.com and send all URLs of the same domain
   #   WaybackArchiver.archive('example.com') # Default strategy is :auto
   #   WaybackArchiver.archive('example.com', strategy: :auto)
@@ -43,11 +46,19 @@ module WaybackArchiver
   #   WaybackArchiver.archive('example.com', strategy: :url, concurrency: 10)
   #   WaybackArchiver.archive('example.com', strategy: :url, limit: 100) # send max 100 URLs
   #   WaybackArchiver.archive('example.com', :url)
-  def self.archive(source, legacy_strategy = nil, strategy: :auto, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
+  # @example Crawl multiple hosts
+  #   WaybackArchiver.archive(
+  #     'http://example.com',
+  #     hosts: [
+  #       'example.com',
+  #       /host[\d]+\.example\.com/
+  #     ]
+  #   )
+  def self.archive(source, legacy_strategy = nil, strategy: :auto, hosts: [], concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
     strategy = legacy_strategy || strategy
 
     case strategy.to_s
-    when 'crawl' then crawl(source, concurrency: concurrency, limit: limit, &block)
+    when 'crawl' then crawl(source, concurrency: concurrency, limit: limit, hosts: hosts, &block)
     when 'auto' then auto(source, concurrency: concurrency, limit: limit, &block)
     when 'sitemap' then sitemap(source, concurrency: concurrency, limit: limit, &block)
     when 'urls' then urls(source, concurrency: concurrency, limit: limit, &block)
@@ -63,7 +74,7 @@ module WaybackArchiver
   # @param [String] source (must be a valid URL).
   # @param concurrency [Integer]
   # @example Auto archive example.com
-  #   WaybackArchiver.auto('example.com') # Default concurrency is 5
+  #   WaybackArchiver.auto('example.com') # Default concurrency is 1
   # @example Auto archive example.com with low concurrency
   #   WaybackArchiver.auto('example.com', concurrency: 1)
   # @example Auto archive example.com and archive max 100 URLs
@@ -79,16 +90,25 @@ module WaybackArchiver
   # Crawl site for URLs to send to the Wayback Machine.
   # @return [Array<ArchiveResult>] of URLs sent to the Wayback Machine.
   # @param [String] url to start crawling from.
+  # @param [Array<String, Regexp>] hosts to crawl
   # @param concurrency [Integer]
   # @example Crawl example.com and send all URLs of the same domain
-  #   WaybackArchiver.crawl('example.com') # Default concurrency is 5
+  #   WaybackArchiver.crawl('example.com') # Default concurrency is 1
   # @example Crawl example.com and send all URLs of the same domain with low concurrency
   #   WaybackArchiver.crawl('example.com', concurrency: 1)
   # @example Crawl example.com and archive max 100 URLs
   #   WaybackArchiver.crawl('example.com', limit: 100)
-  def self.crawl(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
+  # @example Crawl multiple hosts
+  #   URLCollector.crawl(
+  #     'http://example.com',
+  #     hosts: [
+  #       'example.com',
+  #       /host[\d]+\.example\.com/
+  #     ]
+  #   )
+  def self.crawl(url, hosts: [], concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
     WaybackArchiver.logger.info "Crawling #{url}"
-    Archive.crawl(url, concurrency: concurrency, limit: limit, &block)
+    Archive.crawl(url, hosts: hosts, concurrency: concurrency, limit: limit, &block)
   end
 
   # Get URLs from sitemap and send found URLs to the Wayback Machine.
@@ -96,7 +116,7 @@ module WaybackArchiver
   # @param [String] url to the sitemap.
   # @param concurrency [Integer]
   # @example Get example.com sitemap and archive all found URLs
-  #   WaybackArchiver.sitemap('example.com/sitemap.xml') # Default concurrency is 5
+  #   WaybackArchiver.sitemap('example.com/sitemap.xml') # Default concurrency is 1
   # @example Get example.com sitemap and archive all found URLs with low concurrency
   #   WaybackArchiver.sitemap('example.com/sitemap.xml', concurrency: 1)
   # @example Get example.com sitemap archive max 100 URLs
@@ -155,6 +175,19 @@ module WaybackArchiver
     @user_agent ||= USER_AGENT
   end
 
+  # Sets the default respect_robots_txt
+  # @return [Boolean] the desired default for respect_robots_txt
+  # @param [Boolean] respect_robots_txt the desired default
+  def self.respect_robots_txt=(respect_robots_txt)
+    @respect_robots_txt = respect_robots_txt
+  end
+
+  # Returns the default respect_robots_txt
+  # @return [Boolean] the configured or the default respect_robots_txt
+  def self.respect_robots_txt
+    @respect_robots_txt ||= DEFAULT_RESPECT_ROBOTS_TXT
+  end
+
   # Sets the default concurrency
   # @return [Integer] the desired default concurrency
   # @param [Integer] concurrency the desired default concurrency
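
Besides the hosts: keyword, this file adds a module-level respect_robots_txt accessor whose default (DEFAULT_RESPECT_ROBOTS_TXT) is false, so robots.txt is now ignored unless explicitly enabled. A short configuration sketch, assuming the pre-existing concurrency= setter documented in the context lines above:

require 'wayback_archiver'

WaybackArchiver.respect_robots_txt = true # opt back in; the new default is false
WaybackArchiver.concurrency = 5           # restore the pre-1.4.0 default if desired
WaybackArchiver.archive('https://example.com', strategy: :auto)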
data/lib/wayback_archiver/archive.rb CHANGED
@@ -9,7 +9,7 @@ module WaybackArchiver
     # Send URLs to Wayback Machine.
     # @return [Array<ArchiveResult>] with sent URLs.
     # @param [Array<String>] urls to send to the Wayback Machine.
-    # @param concurrency [Integer] the default is 5
+    # @param concurrency [Integer] the default is 1
     # @yield [archive_result] If a block is given, each result will be yielded
     # @yieldparam [ArchiveResult] archive_result
     # @example Archive urls, asynchronously
@@ -54,7 +54,8 @@ module WaybackArchiver
     # Send URLs to Wayback Machine by crawling the site.
     # @return [Array<ArchiveResult>] with URLs sent to the Wayback Machine.
     # @param [String] source for URL to crawl.
-    # @param concurrency [Integer] the default is 5
+    # @param concurrency [Integer] the default is 1
+    # @param [Array<String, Regexp>] hosts to crawl
     # @yield [archive_result] If a block is given, each result will be yielded
     # @yieldparam [ArchiveResult] archive_result
     # @example Crawl example.com and send all URLs of the same domain
@@ -66,13 +67,21 @@ module WaybackArchiver
     #   Archiver.crawl('example.com', concurrency: 1)
     # @example Stop after archiving 100 links
     #   Archiver.crawl('example.com', limit: 100)
-    def self.crawl(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
+    # @example Crawl multiple hosts
+    #   URLCollector.crawl(
+    #     'http://example.com',
+    #     hosts: [
+    #       'example.com',
+    #       /host[\d]+\.example\.com/
+    #     ]
+    #   )
+    def self.crawl(source, hosts: [], concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
       WaybackArchiver.logger.info "Request are sent with up to #{concurrency} parallel threads"
 
       posted_urls = Concurrent::Array.new
      pool = ThreadPool.build(concurrency)
 
-      found_urls = URLCollector.crawl(source, limit: limit) do |url|
+      found_urls = URLCollector.crawl(source, hosts: hosts, limit: limit) do |url|
        pool.post do
          result = post_url(url)
          yield(result) if block_given?
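
Archive.crawl still yields each ArchiveResult from the thread pool as URLs are posted, and WaybackArchiver.crawl forwards a given block unchanged. A small sketch of consuming results as they arrive; the fields of ArchiveResult are not shown in this diff, so the yielded object is only inspected:

require 'wayback_archiver'

WaybackArchiver.crawl('https://example.com', hosts: ['example.com']) do |result|
  # Each yielded result is an ArchiveResult produced as the crawler posts URLs.
  puts result.inspect
end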
data/lib/wayback_archiver/url_collector.rb CHANGED
@@ -19,17 +19,27 @@ module WaybackArchiver
     # Retrieve URLs by crawling.
     # @return [Array<String>] of URLs defined found during crawl.
     # @param [String] url domain to crawl URLs from.
+    # @param [Array<String, Regexp>] hosts to crawl.
     # @example Crawl URLs defined on example.com
     #   URLCollector.crawl('http://example.com')
     # @example Crawl URLs defined on example.com and limit the number of visited pages to 100
     #   URLCollector.crawl('http://example.com', limit: 100)
     # @example Crawl URLs defined on example.com and explicitly set no upper limit on the number of visited pages to 100
     #   URLCollector.crawl('http://example.com', limit: -1)
-    def self.crawl(url, limit: WaybackArchiver.max_limit)
+    # @example Crawl multiple hosts
+    #   URLCollector.crawl(
+    #     'http://example.com',
+    #     hosts: [
+    #       'example.com',
+    #       /host[\d]+\.example\.com/
+    #     ]
+    #   )
+    def self.crawl(url, hosts: [], limit: WaybackArchiver.max_limit)
       urls = []
       start_at_url = Request.build_uri(url).to_s
       options = {
-        robots: true,
+        robots: WaybackArchiver.respect_robots_txt,
+        hosts: hosts,
         user_agent: WaybackArchiver.user_agent
       }
       options[:limit] = limit unless limit == -1
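
The options hash built here (robots:, hosts:, user_agent:, and optionally limit:) is presumably forwarded to the Spidr crawler, the gem's runtime dependency bumped to ~> 0.6.1 in the metadata below. A rough sketch of an equivalent direct Spidr call, assuming Spidr.site accepts these options as in spidr 0.6:

require 'spidr'

Spidr.site(
  'https://example.com',
  robots: false, # mirrors the new WaybackArchiver.respect_robots_txt default
  hosts: ['example.com', /host\d+\.example\.com/],
  user_agent: 'WaybackArchiver/1.4.0 (+https://rubygems.org/gems/wayback_archiver)',
  limit: 100
) do |agent|
  agent.every_page { |page| puts page.url } # print each crawled URL
end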
data/lib/wayback_archiver/version.rb CHANGED
@@ -1,4 +1,4 @@
 module WaybackArchiver
   # Gem version
-  VERSION = '1.3.0'.freeze
+  VERSION = '1.4.0'.freeze
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_archiver
 version: !ruby/object:Gem::Version
-  version: 1.3.0
+  version: 1.4.0
 platform: ruby
 authors:
 - Jacob Burenstam
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-01-24 00:00:00.000000000 Z
+date: 2021-04-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: spidr
@@ -16,14 +16,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.6.0
+        version: 0.6.1
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.6.0
+        version: 0.6.1
 - !ruby/object:Gem::Dependency
   name: robots
   requirement: !ruby/object:Gem::Requirement
@@ -58,28 +58,28 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.3'
+        version: '2.1'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.3'
+        version: '2.1'
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '10.3'
+        version: '12.3'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '10.3'
+        version: '12.3'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
@@ -220,8 +220,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.7.6
+rubygems_version: 3.1.4
 signing_key:
 specification_version: 4
 summary: Post URLs to Wayback Machine (Internet Archive)