archaeo 0.2.7 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/archaeo/archive_health_check.rb +77 -0
- data/lib/archaeo/bulk_downloader.rb +82 -24
- data/lib/archaeo/cdx_api.rb +39 -7
- data/lib/archaeo/cdx_cache.rb +105 -0
- data/lib/archaeo/cli.rb +109 -8
- data/lib/archaeo/download_state.rb +35 -0
- data/lib/archaeo/encoding_detector.rb +91 -0
- data/lib/archaeo/page.rb +1 -1
- data/lib/archaeo/path_sanitizer.rb +152 -0
- data/lib/archaeo/pattern_filter.rb +80 -0
- data/lib/archaeo/rate_limiter.rb +86 -0
- data/lib/archaeo/save_api.rb +7 -2
- data/lib/archaeo/save_result.rb +26 -8
- data/lib/archaeo/subdomain_discovery.rb +117 -0
- data/lib/archaeo/url_rewriter.rb +64 -7
- data/lib/archaeo/version.rb +1 -1
- data/lib/archaeo.rb +7 -0
- metadata +9 -2
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Archaeo
  # Detects and transcodes content from legacy encodings to UTF-8.
  #
  # Tries a configurable list of encodings in priority order,
  # returning the first that produces valid output. Used as a
  # fallback when Content-Type charset and HTML meta charset are
  # both absent.
  class EncodingDetector
    DEFAULT_ENCODINGS = [
      Encoding::UTF_8,
      Encoding::Windows_1251,
      Encoding::GB18030,
      Encoding::Shift_JIS,
      Encoding::EUC_KR,
      Encoding::ISO_8859_1,
      Encoding::Windows_1252,
    ].freeze

    # Fraction of disallowed control bytes above which content counts as binary.
    BINARY_THRESHOLD = 0.1
    # Control bytes that are legitimate in text: TAB, LF, CR.
    TEXT_CONTROL_BYTES = [0x09, 0x0A, 0x0D].freeze

    # @param encodings [Array<Encoding>] candidates tried in priority order
    def initialize(encodings: DEFAULT_ENCODINGS)
      @encodings = encodings
    end

    # Returns the first candidate encoding in which +bytes+ form a valid
    # string; UTF-8 when input is blank or nothing matches.
    def detect(bytes)
      return Encoding::UTF_8 if bytes.nil? || bytes.empty?

      raw = bytes_to_string(bytes)
      @encodings.find { |enc| valid_in_encoding?(raw, enc) } || Encoding::UTF_8
    end

    # Converts +bytes+ to a UTF-8 string, auto-detecting the source
    # encoding. Invalid/unmappable characters become "?". +fallback+ is the
    # encoding assumed when transcoding from the detected one raises.
    def transcode(bytes, fallback: Encoding::UTF_8)
      return "" if bytes.nil? || bytes.empty?

      text = bytes.is_a?(String) ? bytes.dup : bytes.to_s
      return text if text.encoding == Encoding::UTF_8 && text.valid_encoding?

      raw = bytes_to_string(bytes)
      source = detect(bytes)
      return raw.force_encoding(Encoding::UTF_8) if source == Encoding::UTF_8

      encode_to_utf8(raw, source, fallback)
    end

    # Heuristic binary check: true when more than BINARY_THRESHOLD of the
    # first 4 KiB are control bytes other than TAB/LF/CR.
    def binary?(bytes)
      return false if bytes.nil? || bytes.empty?

      sample = bytes.byteslice(0, [bytes.bytesize, 4096].min)
      suspicious = sample.each_byte.count do |byte|
        byte < 0x20 && !TEXT_CONTROL_BYTES.include?(byte)
      end
      suspicious.fdiv(sample.bytesize) > BINARY_THRESHOLD
    end

    private

    # Normalizes input to a fresh BINARY (ASCII-8BIT) string.
    def bytes_to_string(bytes)
      raw = bytes.is_a?(String) ? bytes.dup : bytes.to_s
      raw.force_encoding(Encoding::ASCII_8BIT)
    end

    # True when the byte sequence is valid under +encoding+.
    def valid_in_encoding?(string, encoding)
      string.dup.force_encoding(encoding).valid_encoding?
    rescue StandardError
      false
    end

    # Transcodes +string+ (labeled +source_encoding+) to UTF-8; on failure,
    # retries assuming +fallback+ as the source.
    def encode_to_utf8(string, source_encoding, fallback)
      string.dup.force_encoding(source_encoding)
            .encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: "?")
    rescue StandardError
      string.dup.force_encoding(fallback)
            .encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: "?")
    end
  end
end
|
data/lib/archaeo/page.rb
CHANGED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "digest"
require "fileutils"
require "set"
require "uri"

module Archaeo
  # Sanitizes URLs into safe filesystem paths.
  #
  # Handles recursive percent-decoding, query string hashing,
  # segment truncation, and invalid character replacement.
  class PathSanitizer
    DEFAULT_MAX_SEGMENT = 200
    HASH_LENGTH = 8
    # Caps nested percent-decoding (e.g. "%252e" -> "%2e" -> ".").
    MAX_DECODE_ITERATIONS = 5

    INVALID_CHARS = /[<>:"|?*#]/
    SEPARATOR_RE = %r{[/\\]}
    # Segments that would let a sanitized path escape the download root.
    DOT_SEGMENTS = %w[. ..].freeze

    attr_reader :max_segment_length

    def initialize(max_segment_length: DEFAULT_MAX_SEGMENT)
      @max_segment_length = max_segment_length
    end

    # Converts +url+ into a relative filesystem path: strips the scheme,
    # fully percent-decodes, replaces the query string with a short hash,
    # and cleans each path segment.
    def sanitize(url)
      path = strip_scheme(url)
      path = recursive_decode(path)
      path = hash_query_strings(path)
      clean_segments(path)
    end

    # Filesystem identifier for a Wayback Machine snapshot URL.
    def file_id(archive_url)
      stripped = strip_archive_prefix(archive_url)
      sanitize(stripped)
    end

    # Cleans a single path segment: neutralizes "." / ".." (directory
    # traversal — reachable via percent-encoded dots in URLs), replaces
    # characters invalid on common filesystems, and truncates.
    def segment_for(path_segment)
      return "_" if DOT_SEGMENTS.include?(path_segment)

      cleaned = path_segment.gsub(INVALID_CHARS, "_")
      truncate(cleaned)
    end

    private

    def strip_scheme(url)
      url.to_s.sub(%r{\Ahttps?://}, "")
    end

    # Removes a "web.archive.org/web/<timestamp>[id_]/" prefix plus any
    # remaining scheme on the embedded original URL.
    def strip_archive_prefix(url)
      url.to_s.sub(%r{\Ahttps?://web\.archive\.org/web/\d+(?:id_)?/}, "")
         .sub(%r{\Ahttps?://}, "")
    end

    # Percent-decodes repeatedly (bounded) so double-encoded characters
    # cannot smuggle separators or dot segments past cleaning.
    def recursive_decode(str)
      MAX_DECODE_ITERATIONS.times do
        decoded = decode(str)
        return decoded if decoded == str

        str = decoded
      end
      str
    end

    def decode(str)
      URI.decode_www_form_component(str)
    rescue StandardError
      str
    end

    # Replaces "?query" with "_<8-char SHA-256 prefix>" so distinct query
    # strings map to distinct, filesystem-safe names.
    def hash_query_strings(path)
      return path unless path.include?("?")

      base, query = path.split("?", 2)
      hash = Digest::SHA256.hexdigest(query)[0, HASH_LENGTH]
      "#{base}_#{hash}"
    end

    def clean_segments(path)
      segments = path.split(SEPARATOR_RE).reject(&:empty?)
      return "" if segments.empty?

      segments.map { |seg| segment_for(seg) }.join(File::SEPARATOR)
    end

    def truncate(segment)
      return segment if segment.length <= @max_segment_length

      segment[0, @max_segment_length]
    end
  end

  # Resolves file/directory path conflicts in download targets.
  #
  # Detects when a file path would block creation of a needed directory
  # (or vice versa) and resolves by relocating the file.
  class PathConflictResolver
    def initialize(base_dir)
      @base_dir = base_dir
    end

    # Relocates every file in +paths+ that shadows a directory another
    # path needs, then returns +paths+ unchanged.
    def resolve(paths)
      conflicts = detect_conflicts(paths)
      relocate_conflicts(conflicts)
      paths
    end

    # True when +file_path+ is an existing regular file whose parent
    # directory already contains subdirectories (i.e. it may be shadowing
    # a directory some other download needs).
    def conflict?(file_path)
      return false if File.directory?(file_path)
      return false unless File.file?(file_path)

      needs_directory_under?(file_path)
    end

    private

    # Paths that are both an on-disk file and a directory-prefix of
    # another requested path. The File.file? check is hoisted so the
    # filesystem is probed once per path, not once per pair.
    def detect_conflicts(paths)
      conflicts = Set.new
      paths.each do |path|
        next unless File.file?(path)

        prefix = path + File::SEPARATOR
        conflicts << path if paths.any? { |other| other.start_with?(prefix) }
      end
      conflicts.to_a
    end

    # Moves each conflicting file aside, creates the directory in its
    # place, and stores the file inside it as "index<ext>".
    def relocate_conflicts(conflicts)
      conflicts.each do |conflict_path|
        next unless File.file?(conflict_path)

        ext = File.extname(conflict_path)
        tmp_file = "#{conflict_path}.archaeo_tmp"
        FileUtils.mv(conflict_path, tmp_file)
        FileUtils.mkdir_p(conflict_path)
        FileUtils.mv(tmp_file, File.join(conflict_path, "index#{ext}"))
      end
    end

    def needs_directory_under?(file_path)
      parent = File.dirname(file_path)
      Dir.glob("#{parent}/*").any? { |c| File.directory?(c) }
    end
  end
end
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Archaeo
  # URL pattern filter for include/exclude matching during downloads.
  #
  # Supports string substring matching, Regexp objects, and
  # %r{...} and /.../ string-to-regexp conversion with inline flags.
  class PatternFilter
    # @param only [String, Regexp, Array, nil] patterns a URL must match
    # @param exclude [String, Regexp, Array, nil] patterns that reject a URL
    def initialize(only: nil, exclude: nil)
      @only_patterns = compile_patterns(Array(only))
      @exclude_patterns = compile_patterns(Array(exclude))
    end

    # True when +url+ is not excluded and matches the include list.
    # An empty include list accepts everything not excluded.
    def match?(url)
      candidate = url.to_s
      return false if excluded?(candidate)

      @only_patterns.empty? || included?(candidate)
    end

    # Inverse of #match?.
    def reject?(url)
      !match?(url)
    end

    # Normalizes +pattern+ to a Regexp. Strings may be plain substrings,
    # "%r{...}" literals, or "/.../flags" literals (flags: i, m, x).
    def self.to_regex(pattern)
      case pattern
      when Regexp then pattern
      when String then parse_regex_string(pattern)
      else
        raise ArgumentError,
              "Pattern must be String or Regexp, got #{pattern.class}"
      end
    end

    private

    def compile_patterns(patterns)
      patterns.map { |pattern| self.class.to_regex(pattern) }
    end

    def included?(url)
      @only_patterns.any? { |re| url.match?(re) }
    end

    def excluded?(url)
      @exclude_patterns.any? { |re| url.match?(re) }
    end

    # Recognizes the three string syntaxes; anything else becomes an
    # escaped substring match.
    private_class_method def self.parse_regex_string(str)
      text = str.strip
      if text.start_with?("%r{") && text.end_with?("}")
        build_regex(text[3..-2])
      elsif text.start_with?("/") && text.end_with?("/")
        build_regex(text[1..-2])
      elsif text.start_with?("/") && text.length > 1
        # "/body/flags" — everything after the final slash is flags.
        divider = text.rindex("/")
        build_regex(text[1...divider], parse_flags(text[(divider + 1)..]))
      else
        Regexp.new(Regexp.escape(text))
      end
    end

    private_class_method def self.build_regex(body, options = 0)
      Regexp.new(body, options)
    end

    # Maps flag characters to Regexp option bits.
    private_class_method def self.parse_flags(flags)
      { "i" => Regexp::IGNORECASE,
        "m" => Regexp::MULTILINE,
        "x" => Regexp::EXTENDED }.sum(0) { |flag, bit| flags.include?(flag) ? bit : 0 }
    end
  end
end
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Archaeo
  # Thread-safe request rate limiter.
  #
  # Enforces a minimum interval between requests to avoid hitting
  # Wayback Machine rate limits. Supports per-host limiting and
  # adaptive backoff when 429 responses are received.
  class RateLimiter
    DEFAULT_MIN_INTERVAL = 0
    # Interval seeded when backing off from zero — doubling zero would
    # otherwise never slow anything down.
    INITIAL_BACKOFF = 1.0
    # Upper bound for any backed-off interval, in seconds.
    MAX_INTERVAL = 60

    def initialize(min_interval: DEFAULT_MIN_INTERVAL)
      @min_interval = min_interval.to_f
      @mutex = Mutex.new
      @last_request_at = 0.0
      @host_last = {}      # host => monotonic time of last request
      @host_intervals = {} # host => backed-off interval in seconds
    end

    # Blocks until the applicable interval has elapsed since the last
    # request (per-host when +host+ is given, global otherwise).
    def wait(host: nil)
      @mutex.synchronize do
        if host
          wait_for_host(host)
        else
          wait_global
        end
      end
    end

    # Doubles the applicable interval (capped at MAX_INTERVAL), then
    # waits. Call after receiving a 429 response.
    #
    # NOTE: per-host backoff previously wrote the interval into the
    # last-request timestamp map, silently disabling it; intervals now
    # live in their own map.
    def backoff(host: nil)
      @mutex.synchronize do
        if host
          key = host.to_sym
          current = @host_intervals[key] || @min_interval
          @host_intervals[key] = increase(current)
        else
          @min_interval = increase(@min_interval)
        end
      end
      wait(host: host)
    end

    # Clears state for +host+, or all state when no host is given.
    def reset(host: nil)
      @mutex.synchronize do
        if host
          key = host.to_sym
          @host_last.delete(key)
          @host_intervals.delete(key)
        else
          @last_request_at = 0.0
          @host_last.clear
          @host_intervals.clear
        end
      end
    end

    # Current global minimum interval in seconds.
    def interval
      @mutex.synchronize { @min_interval }
    end

    private

    # Doubled interval: seeded with INITIAL_BACKOFF when starting from
    # zero, capped at MAX_INTERVAL.
    def increase(current)
      current.positive? ? [current * 2, MAX_INTERVAL].min : INITIAL_BACKOFF
    end

    def wait_global
      return if @min_interval <= 0

      elapsed = now - @last_request_at
      sleep_for(@min_interval - elapsed) if elapsed < @min_interval
      @last_request_at = now
    end

    def wait_for_host(host)
      key = host.to_sym
      # The host's backed-off interval overrides the global minimum
      # when larger; guard here (not in #wait) so a per-host backoff
      # still applies when the global minimum is zero.
      host_interval = [@host_intervals[key] || 0.0, @min_interval, 0].max
      return if host_interval <= 0

      @host_last[key] ||= 0.0
      elapsed = now - @host_last[key]
      sleep_for(host_interval - elapsed) if elapsed < host_interval
      @host_last[key] = now
    end

    def sleep_for(seconds)
      return unless seconds.positive?

      sleep(seconds)
    end

    # Monotonic clock — immune to wall-clock adjustments.
    def now
      Process.clock_gettime(Process::CLOCK_MONOTONIC)
    end
  end
end
|
data/lib/archaeo/save_api.rb
CHANGED
|
@@ -62,8 +62,13 @@ module Archaeo
|
|
|
62
62
|
|
|
63
63
|
ts = Timestamp.parse(extract_timestamp(archive_url))
|
|
64
64
|
cached = ts.to_time < start_time - 2700
|
|
65
|
-
SaveResult.new(
|
|
66
|
-
|
|
65
|
+
SaveResult.new(
|
|
66
|
+
url: url, archive_url: archive_url,
|
|
67
|
+
timestamp: ts, cached: cached,
|
|
68
|
+
status_code: response.status,
|
|
69
|
+
response_url: response.headers["location"],
|
|
70
|
+
response_headers: response.headers
|
|
71
|
+
)
|
|
67
72
|
end
|
|
68
73
|
|
|
69
74
|
def check_response_errors!(response, url)
|
data/lib/archaeo/save_result.rb
CHANGED
|
@@ -3,16 +3,22 @@
|
|
|
3
3
|
module Archaeo
|
|
4
4
|
# Model representing the outcome of a SavePageNow request.
|
|
5
5
|
#
|
|
6
|
-
# Contains the resulting archive URL, timestamp,
|
|
7
|
-
# the page was already cached
|
|
6
|
+
# Contains the resulting archive URL, timestamp, whether
|
|
7
|
+
# the page was already cached, and response metadata.
|
|
8
8
|
class SaveResult
|
|
9
|
-
attr_reader :url, :archive_url, :timestamp
|
|
9
|
+
attr_reader :url, :archive_url, :timestamp, :status_code,
|
|
10
|
+
:response_url, :response_headers
|
|
10
11
|
|
|
11
|
-
def initialize(url:, archive_url:, timestamp:, cached
|
|
12
|
+
def initialize(url:, archive_url:, timestamp:, cached:,
|
|
13
|
+
status_code: nil, response_url: nil,
|
|
14
|
+
response_headers: nil)
|
|
12
15
|
@url = url
|
|
13
16
|
@archive_url = archive_url
|
|
14
17
|
@timestamp = timestamp ? Timestamp.coerce(timestamp) : nil
|
|
15
18
|
@cached = cached
|
|
19
|
+
@status_code = status_code
|
|
20
|
+
@response_url = response_url
|
|
21
|
+
@response_headers = response_headers
|
|
16
22
|
end
|
|
17
23
|
|
|
18
24
|
def cached?
|
|
@@ -24,13 +30,25 @@ module Archaeo
|
|
|
24
30
|
end
|
|
25
31
|
|
|
26
32
|
def to_h
|
|
27
|
-
{
|
|
28
|
-
|
|
33
|
+
{
|
|
34
|
+
url: @url,
|
|
35
|
+
archive_url: @archive_url,
|
|
36
|
+
timestamp: @timestamp,
|
|
37
|
+
cached: @cached,
|
|
38
|
+
status_code: @status_code,
|
|
39
|
+
response_url: @response_url,
|
|
40
|
+
}
|
|
29
41
|
end
|
|
30
42
|
|
|
31
43
|
def as_json(*)
|
|
32
|
-
{
|
|
33
|
-
|
|
44
|
+
{
|
|
45
|
+
url: @url,
|
|
46
|
+
archive_url: @archive_url,
|
|
47
|
+
timestamp: @timestamp.to_s,
|
|
48
|
+
cached: @cached,
|
|
49
|
+
status_code: @status_code,
|
|
50
|
+
response_url: @response_url,
|
|
51
|
+
}
|
|
34
52
|
end
|
|
35
53
|
|
|
36
54
|
def to_s
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "set"
require "uri"

module Archaeo
  # Discovers subdomains from downloaded content.
  #
  # Scans HTML, CSS, and JavaScript files for links to subdomains
  # of a base domain, enabling recursive archival.
  class SubdomainDiscovery
    # Common two-part public suffixes, so "shop.foo.co.uk" resolves to
    # base domain "foo.co.uk" rather than "co.uk".
    MULTI_PART_TLDS = %w[
      co.uk com.au co.jp co.nz co.za com.br com.mx
      com.sg co.in co.kr com.tw com.hk org.uk ac.uk
      co.il com.ar co.id co.th com.my com.tr co.ke
    ].freeze

    HTML_URL_ATTRS = %w[href src action].freeze
    HTML_URL_RE = /https?:\/\/([a-z0-9][-a-z0-9.]*[a-z0-9])/i
    CSS_URL_RE = /url\(\s*['"]?(https?:\/\/[^'")\s]+)['"]?\s*\)/i
    JS_STRING_RE = /['"](https?:\/\/[a-z0-9][-a-z0-9.]*[a-z0-9][^\s'"]*)['"]/i

    # @param base_domain [String] domain whose subdomains are of interest
    # @param max_depth [Integer] recursion limit for #discover_recursive
    def initialize(base_domain, max_depth: 1)
      @base_domain = base_domain.to_s
      @max_depth = max_depth
      @visited = Set.new
    end

    # Hosts that are subdomains of the base domain referenced in
    # +content+; +content_type+ is :html, :css, or :js.
    def scan_content(content, content_type:)
      urls = extract_urls(content, content_type)
      filter_subdomains(urls)
    end

    # Scans every HTML/CSS/JS file under +directory+ and returns the
    # subdomain hosts found.
    def scan_files(directory)
      found = Set.new
      Dir.glob(File.join(directory, "**", "*")).each do |path|
        next unless File.file?(path)

        # Decide by extension before reading, so unrelated files are
        # never loaded into memory.
        content_type = content_type_for_ext(File.extname(path).downcase)
        next unless content_type

        # Read raw bytes and scrub invalid sequences: File.read's
        # invalid:/undef: options are transcoding options that do not
        # apply without a conversion, so invalid bytes would otherwise
        # raise ArgumentError in the regex scans below.
        content = File.binread(path).force_encoding(Encoding::UTF_8).scrub("")
        found.merge(scan_content(content, content_type: content_type))
      end
      found.to_a
    end

    # Returns subdomains under +directory+ not seen before, recording
    # them as visited. Returns [] once +max_depth+ is reached.
    def discover_recursive(directory, depth: 0)
      return [] if depth >= @max_depth

      subdomains = scan_files(directory)
      new_subdomains = subdomains.reject { |s| @visited.include?(s) }
      @visited.merge(new_subdomains)
      new_subdomains
    end

    # Registrable base domain of +host+, accounting for common two-part
    # public suffixes (e.g. "a.b.co.uk" -> "b.co.uk").
    def base_domain(host)
      parts = host.to_s.downcase.split(".")
      return host.to_s if parts.length <= 2

      MULTI_PART_TLDS.each do |tld|
        tld_parts = tld.split(".")
        if parts.last(tld_parts.length) == tld_parts
          return parts.last(tld_parts.length + 1).join(".")
        end
      end

      parts.last(2).join(".")
    end

    private

    def extract_urls(content, content_type)
      case content_type
      when :html then extract_html_urls(content)
      when :css then extract_css_urls(content)
      when :js then extract_js_urls(content)
      else []
      end
    end

    def extract_html_urls(content)
      content.scan(HTML_URL_RE).flatten.map { |h| "https://#{h}" }
    end

    def extract_css_urls(content)
      content.scan(CSS_URL_RE).flatten
    end

    def extract_js_urls(content)
      content.scan(JS_STRING_RE).flatten
    end

    # Keeps only hosts that are strict subdomains of the configured
    # base domain; unparsable URLs are skipped.
    def filter_subdomains(urls)
      base = base_domain(@base_domain)
      urls.filter_map do |url|
        host = begin
          URI.parse(url).host.to_s.downcase
        rescue URI::InvalidURIError
          next
        end
        next unless host.end_with?(".#{base}") && host != base

        host
      end.uniq
    end

    def content_type_for_ext(ext)
      case ext
      when ".html", ".htm" then :html
      when ".css" then :css
      when ".js" then :js
      end
    end
  end
end
|