archaeo 0.2.7 → 0.2.8

@@ -0,0 +1,91 @@
+ # frozen_string_literal: true
+
+ module Archaeo
+   # Detects and transcodes content from legacy encodings to UTF-8.
+   #
+   # Tries a configurable list of encodings in priority order,
+   # returning the first that produces valid output. Used as a
+   # fallback when Content-Type charset and HTML meta charset are
+   # both absent.
+   class EncodingDetector
+     DEFAULT_ENCODINGS = [
+       Encoding::UTF_8,
+       Encoding::Windows_1251,
+       Encoding::GB18030,
+       Encoding::Shift_JIS,
+       Encoding::EUC_KR,
+       Encoding::ISO_8859_1,
+       Encoding::Windows_1252,
+     ].freeze
+
+     BINARY_THRESHOLD = 0.1
+     TEXT_CONTROL_BYTES = [0x09, 0x0A, 0x0D].freeze
+
+     def initialize(encodings: DEFAULT_ENCODINGS)
+       @encodings = encodings
+     end
+
+     def detect(bytes)
+       return Encoding::UTF_8 if bytes.nil? || bytes.empty?
+
+       string = bytes_to_string(bytes)
+
+       @encodings.each do |enc|
+         return enc if valid_in_encoding?(string, enc)
+       end
+
+       Encoding::UTF_8
+     end
+
+     def transcode(bytes, fallback: Encoding::UTF_8)
+       return "" if bytes.nil? || bytes.empty?
+
+       string = bytes.is_a?(String) ? bytes.dup : bytes.to_s
+       return string if string.encoding == Encoding::UTF_8 && string.valid_encoding?
+
+       binary = bytes_to_string(bytes)
+       detected = detect(bytes)
+       return binary.force_encoding(Encoding::UTF_8) if detected == Encoding::UTF_8
+
+       encode_to_utf8(binary, detected, fallback)
+     end
+
+     def binary?(bytes)
+       return false if bytes.nil? || bytes.empty?
+
+       sample = bytes.byteslice(0, [bytes.bytesize, 4096].min)
+       non_printable = sample.bytes.count do |b|
+         b < 0x20 && !TEXT_CONTROL_BYTES.include?(b)
+       end
+       non_printable.to_f / sample.bytesize > BINARY_THRESHOLD
+     end
+
+     private
+
+     def bytes_to_string(bytes)
+       case bytes
+       when String then bytes.dup.force_encoding(Encoding::ASCII_8BIT)
+       else bytes.to_s.force_encoding(Encoding::ASCII_8BIT)
+       end
+     end
+
+     def valid_in_encoding?(string, encoding)
+       candidate = string.dup.force_encoding(encoding)
+       candidate.valid_encoding?
+     rescue StandardError
+       false
+     end
+
+     def encode_to_utf8(string, source_encoding, fallback)
+       candidate = string.dup.force_encoding(source_encoding)
+       candidate.encode(Encoding::UTF_8,
+                        invalid: :replace, undef: :replace,
+                        replace: "?")
+     rescue StandardError
+       string.dup.force_encoding(fallback)
+             .encode(Encoding::UTF_8,
+                     invalid: :replace, undef: :replace,
+                     replace: "?")
+     end
+   end
+ end
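
A hedged usage sketch (the byte string and return values are illustrative, not from the package). One quirk of the priority list worth knowing: ISO-8859-1 assigns a character to every byte value, so any input that reaches it validates there, which makes Windows_1252 at the end of DEFAULT_ENCODINGS effectively unreachable.

    detector = Archaeo::EncodingDetector.new

    bytes = "\xCF\xF0\xE8\xE2\xE5\xF2".b   # "Привет" encoded as Windows-1251
    detector.detect(bytes)                 # => #<Encoding:Windows-1251> (invalid as UTF-8, valid as 1251)
    detector.transcode(bytes)              # => "Привет" (valid UTF-8)

    detector.binary?("\x00\x01\x02".b)     # => true; over 10% non-text control bytes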
data/lib/archaeo/page.rb CHANGED
@@ -130,7 +130,7 @@ module Archaeo
       html_charset = detect_html_charset
       return Encoding.find(html_charset) if html_charset
 
-      Encoding::UTF_8
+      EncodingDetector.new.detect(@raw_content)
     rescue ArgumentError
       Encoding::UTF_8
     end
@@ -0,0 +1,154 @@
+ # frozen_string_literal: true
+
+ require "digest"
+ require "fileutils"
+ require "set"
+ require "uri"
+
+ module Archaeo
+   # Sanitizes URLs into safe filesystem paths.
+   #
+   # Handles recursive percent-decoding, query string hashing,
+   # segment truncation, and invalid character replacement.
+   class PathSanitizer
+     DEFAULT_MAX_SEGMENT = 200
+     HASH_LENGTH = 8
+     MAX_DECODE_ITERATIONS = 5
+
+     INVALID_CHARS = /[<>:"|?*#]/
+     SEPARATOR_RE = %r{[/\\]}
+
+     attr_reader :max_segment_length
+
+     def initialize(max_segment_length: DEFAULT_MAX_SEGMENT)
+       @max_segment_length = max_segment_length
+     end
+
+     def sanitize(url)
+       path = strip_scheme(url)
+       path = recursive_decode(path)
+       path = hash_query_strings(path)
+       clean_segments(path)
+     end
+
+     def file_id(archive_url)
+       stripped = strip_archive_prefix(archive_url)
+       sanitize(stripped)
+     end
+
+     def segment_for(path_segment)
+       cleaned = path_segment.gsub(INVALID_CHARS, "_")
+       truncate(cleaned)
+     end
+
+     private
+
+     def strip_scheme(url)
+       url.to_s.sub(%r{\Ahttps?://}, "")
+     end
+
+     def strip_archive_prefix(url)
+       url.to_s.sub(%r{\Ahttps?://web\.archive\.org/web/\d+(?:id_)?/}, "")
+          .sub(%r{\Ahttps?://}, "")
+     end
+
+     def recursive_decode(str)
+       MAX_DECODE_ITERATIONS.times do
+         decoded = decode(str)
+         return decoded if decoded == str
+
+         str = decoded
+       end
+       str
+     end
+
+     def decode(str)
+       URI.decode_www_form_component(str)
+     rescue StandardError
+       str
+     end
+
+     def hash_query_strings(path)
+       return path unless path.include?("?")
+
+       base, query = path.split("?", 2)
+       hash = Digest::SHA256.hexdigest(query)[0, HASH_LENGTH]
+       "#{base}_#{hash}"
+     end
+
+     def clean_segments(path)
+       segments = path.split(SEPARATOR_RE).reject(&:empty?)
+       return "" if segments.empty?
+
+       segments.map do |seg|
+         segment_for(seg)
+       end.join(File::SEPARATOR)
+     end
+
+     def truncate(segment)
+       return segment if segment.length <= @max_segment_length
+
+       segment[0, @max_segment_length]
+     end
+   end
+
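A sketch of the sanitizer in use (URLs are illustrative; the query hash is the first 8 hex characters of SHA-256 over the raw query string):

    sanitizer = Archaeo::PathSanitizer.new

    sanitizer.sanitize("https://example.com/a%2Fb/page?id=1")
    # => "example.com/a/b/page_<hash>": "%2F" is percent-decoded into a
    #    separator, and the query string collapses to an 8-char SHA-256 prefix

    sanitizer.file_id("https://web.archive.org/web/20130115000000id_/http://example.com/img.gif")
    # => "example.com/img.gif"
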
+   # Resolves file/directory path conflicts in download targets.
+   #
+   # Detects when a file path would block creation of a needed directory
+   # (or vice versa) and resolves by relocating the file.
+   class PathConflictResolver
+     def initialize(base_dir)
+       @base_dir = base_dir
+     end
+
+     def resolve(paths)
+       conflicts = detect_conflicts(paths)
+       relocate_conflicts(conflicts)
+       paths
+     end
+
+     def conflict?(file_path)
+       return false if File.directory?(file_path)
+       return false unless File.file?(file_path)
+
+       needs_directory_under?(file_path)
+     end
+
+     private
+
+     def detect_conflicts(paths)
+       conflicts = Set.new
+       paths.each do |path|
+         paths.each do |other|
+           next if path == other
+
+           prefix = path + File::SEPARATOR
+           if other.start_with?(prefix) && File.file?(path)
+             # `path` prefixes `other`; as a file it blocks `other`'s directory
+             conflicts << path
+           end
+         end
+       end
+       conflicts.to_a
+     end
+
+     def relocate_conflicts(conflicts)
+       conflicts.each do |conflict_path|
+         next unless File.file?(conflict_path)
+
+         ext = File.extname(conflict_path)
+         tmp_file = "#{conflict_path}.archaeo_tmp"
+         FileUtils.mv(conflict_path, tmp_file)
+         FileUtils.mkdir_p(conflict_path)
+         new_file = File.join(conflict_path, "index#{ext}")
+         FileUtils.mv(tmp_file, new_file)
+       end
+     end
+
+     def needs_directory_under?(file_path)
+       parent = File.dirname(file_path)
+       children = Dir.glob("#{parent}/*")
+       children.any? { |c| File.directory?(c) }
+     end
+   end
+ end
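
How a conflict plays out, assuming the first path already exists on disk as a regular file (all paths here are hypothetical):

    resolver = Archaeo::PathConflictResolver.new("archive")

    paths = [
      "archive/example.com/blog",           # previously saved as a file
      "archive/example.com/blog/post.html", # requires blog/ to be a directory
    ]

    resolver.resolve(paths)
    # "archive/example.com/blog" is moved aside, a directory is created in
    # its place, and the file ends up at "archive/example.com/blog/index"
    # (keeping its original extension, if it had one).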
@@ -0,0 +1,83 @@
+ # frozen_string_literal: true
+
+ module Archaeo
+   # URL pattern filter for include/exclude matching during downloads.
+   #
+   # Supports string substring matching, Regexp objects, and
+   # %r{...} and /.../ string-to-regexp conversion with inline flags.
+   class PatternFilter
+     def initialize(only: nil, exclude: nil)
+       @only_patterns = compile_patterns(Array(only))
+       @exclude_patterns = compile_patterns(Array(exclude))
+     end
+
+     def match?(url)
+       url = url.to_s
+       return false if excluded?(url)
+       return true if @only_patterns.empty?
+
+       included?(url)
+     end
+
+     def reject?(url)
+       !match?(url)
+     end
+
+     def self.to_regex(pattern)
+       case pattern
+       when Regexp then pattern
+       when String then parse_regex_string(pattern)
+       else
+         raise ArgumentError,
+               "Pattern must be String or Regexp, got #{pattern.class}"
+       end
+     end
+
+     private
+
+     def compile_patterns(patterns)
+       patterns.map { |p| self.class.to_regex(p) }
+     end
+
+     def included?(url)
+       @only_patterns.any? { |re| url.match?(re) }
+     end
+
+     def excluded?(url)
+       @exclude_patterns.any? { |re| url.match?(re) }
+     end
+
+     private_class_method def self.parse_regex_string(str)
+       stripped = str.strip
+       if stripped.start_with?("%r{") && stripped.end_with?("}")
+         body = stripped[3..-2]
+         build_regex(body)
+       elsif stripped.length > 1 && stripped.start_with?("/") && stripped.end_with?("/")
+         body = stripped[1..-2]
+         build_regex(body)
+       # Treat "/body/flags" as a regexp only when the trailing flags are
+       # regexp options; strings like "/about" or "/a/b" stay literal substrings.
+       elsif stripped.start_with?("/") && stripped.rindex("/").positive? &&
+             stripped[%r{/[imx]+\z}]
+         last_slash = stripped.rindex("/")
+         body = stripped[1...last_slash]
+         flags = stripped[(last_slash + 1)..]
+         build_regex(body, parse_flags(flags))
+       else
+         Regexp.new(Regexp.escape(stripped))
+       end
+     end
+
+     private_class_method def self.build_regex(body, options = 0)
+       Regexp.new(body, options)
+     end
+
+     private_class_method def self.parse_flags(flags)
+       options = 0
+       options |= Regexp::IGNORECASE if flags.include?("i")
+       options |= Regexp::MULTILINE if flags.include?("m")
+       options |= Regexp::EXTENDED if flags.include?("x")
+       options
+     end
+   end
+ end
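
A sketch of the three pattern forms (patterns and URLs are illustrative): plain strings match as escaped substrings, while %r{...} and /.../flags strings compile to regexps:

    filter = Archaeo::PatternFilter.new(
      only:    ["blog", "%r{\\.html?\\z}"],  # plain substring and %r{} regexp
      exclude: ["/\\.(?:zip|exe)\\z/i"]      # /.../ regexp with an inline flag
    )

    filter.match?("https://example.com/blog/post.html")  # => true
    filter.reject?("https://example.com/files/app.ZIP")  # => true (excluded, case-insensitively)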
@@ -0,0 +1,93 @@
+ # frozen_string_literal: true
+
+ module Archaeo
+   # Thread-safe request rate limiter.
+   #
+   # Enforces a minimum interval between requests to avoid hitting
+   # Wayback Machine rate limits. Supports per-host limiting and
+   # adaptive backoff when 429 responses are received.
+   class RateLimiter
+     DEFAULT_MIN_INTERVAL = 0
+
+     def initialize(min_interval: DEFAULT_MIN_INTERVAL)
+       @min_interval = min_interval.to_f
+       @mutex = Mutex.new
+       @last_request_at = 0.0
+       @host_last = {}                # host => monotonic time of last request
+       @host_interval = Hash.new(0.0) # host => adaptively widened interval
+     end
+
+     def wait(host: nil)
+       return if @min_interval <= 0 && @host_interval.empty?
+
+       @mutex.synchronize do
+         if host
+           wait_for_host(host)
+         else
+           wait_global
+         end
+       end
+     end
+
+     def backoff(host: nil)
+       @mutex.synchronize do
+         if host
+           key = host.to_sym
+           # Widen the per-host interval, not the timestamp in @host_last;
+           # doubling a clock reading would force enormous sleeps.
+           current = [@host_interval[key], @min_interval, 1.0].max
+           @host_interval[key] = [current * 2, 60].min
+         else
+           # Seed from 1s so backoff still takes effect when the limiter
+           # was configured with a zero interval.
+           @min_interval = [[@min_interval, 1.0].max * 2, 60].min
+         end
+       end
+       wait(host: host)
+     end
+
+     def reset(host: nil)
+       @mutex.synchronize do
+         if host
+           @host_last.delete(host.to_sym)
+           @host_interval.delete(host.to_sym)
+         else
+           @last_request_at = 0.0
+           @host_last.clear
+           @host_interval.clear
+         end
+       end
+     end
+
+     def interval
+       @mutex.synchronize { @min_interval }
+     end
+
+     private
+
+     def wait_global
+       elapsed = now - @last_request_at
+       sleep_for(@min_interval - elapsed) if elapsed < @min_interval
+       @last_request_at = now
+     end
+
+     def wait_for_host(host)
+       key = host.to_sym
+       @host_last[key] ||= 0.0
+       host_interval = [@min_interval, @host_interval[key]].max
+       elapsed = now - @host_last[key]
+       sleep_for(host_interval - elapsed) if elapsed < host_interval
+       @host_last[key] = now
+     end
+
+     def sleep_for(seconds)
+       return unless seconds.positive?
+
+       sleep(seconds)
+     end
+
+     def now
+       Process.clock_gettime(Process::CLOCK_MONOTONIC)
+     end
+   end
+ end
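
A sketch of the intended call pattern (interval values are illustrative):

    limiter = Archaeo::RateLimiter.new(min_interval: 0.5)

    limiter.wait(host: "web.archive.org")  # first call records a timestamp, no sleep
    limiter.wait(host: "web.archive.org")  # sleeps until ~0.5s has passed

    limiter.backoff(host: "web.archive.org") # after an HTTP 429: widen the host interval, capped at 60s
    limiter.reset(host: "web.archive.org")   # drop the penalty once requests succeed again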
@@ -62,8 +62,13 @@ module Archaeo
 
       ts = Timestamp.parse(extract_timestamp(archive_url))
       cached = ts.to_time < start_time - 2700
-      SaveResult.new(url: url, archive_url: archive_url,
-                     timestamp: ts, cached: cached)
+      SaveResult.new(
+        url: url, archive_url: archive_url,
+        timestamp: ts, cached: cached,
+        status_code: response.status,
+        response_url: response.headers["location"],
+        response_headers: response.headers
+      )
     end
 
     def check_response_errors!(response, url)
@@ -3,16 +3,22 @@
 module Archaeo
   # Model representing the outcome of a SavePageNow request.
   #
-  # Contains the resulting archive URL, timestamp, and whether
-  # the page was already cached in the archive.
+  # Contains the resulting archive URL, timestamp, whether
+  # the page was already cached, and response metadata.
   class SaveResult
-    attr_reader :url, :archive_url, :timestamp
+    attr_reader :url, :archive_url, :timestamp, :status_code,
+                :response_url, :response_headers
 
-    def initialize(url:, archive_url:, timestamp:, cached:)
+    def initialize(url:, archive_url:, timestamp:, cached:,
+                   status_code: nil, response_url: nil,
+                   response_headers: nil)
       @url = url
       @archive_url = archive_url
       @timestamp = timestamp ? Timestamp.coerce(timestamp) : nil
       @cached = cached
+      @status_code = status_code
+      @response_url = response_url
+      @response_headers = response_headers
     end
 
     def cached?
@@ -24,13 +30,25 @@ module Archaeo
     end
 
     def to_h
-      { url: @url, archive_url: @archive_url,
-        timestamp: @timestamp, cached: @cached }
+      {
+        url: @url,
+        archive_url: @archive_url,
+        timestamp: @timestamp,
+        cached: @cached,
+        status_code: @status_code,
+        response_url: @response_url,
+      }
     end
 
     def as_json(*)
-      { url: @url, archive_url: @archive_url,
-        timestamp: @timestamp.to_s, cached: @cached }
+      {
+        url: @url,
+        archive_url: @archive_url,
+        timestamp: @timestamp.to_s,
+        cached: @cached,
+        status_code: @status_code,
+        response_url: @response_url,
+      }
     end
 
     def to_s
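
The serialized forms now carry the response metadata. A hedged sketch (the client call and all values below are hypothetical); note that response_headers is readable on the object but is not included in to_h or as_json:

    result = client.save("https://example.com")  # hypothetical SavePageNow client
    result.status_code   # => 200
    result.cached?       # => false
    result.to_h
    # => { url: "https://example.com",
    #      archive_url: "https://web.archive.org/web/20240101000000/https://example.com",
    #      timestamp: #<Archaeo::Timestamp ...>, cached: false,
    #      status_code: 200, response_url: nil }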
@@ -0,0 +1,119 @@
+ # frozen_string_literal: true
+
+ require "set"
+ require "uri"
+
+ module Archaeo
+   # Discovers subdomains from downloaded content.
+   #
+   # Scans HTML, CSS, and JavaScript files for links to subdomains
+   # of a base domain, enabling recursive archival.
+   class SubdomainDiscovery
+     MULTI_PART_TLDS = %w[
+       co.uk com.au co.jp co.nz co.za com.br com.mx
+       com.sg co.in co.kr com.tw com.hk org.uk ac.uk
+       co.il com.ar co.id co.th com.my com.tr co.ke
+     ].freeze
+
+     HTML_URL_ATTRS = %w[href src action].freeze
+     HTML_URL_RE = %r{https?://([a-z0-9][-a-z0-9.]*[a-z0-9])}i
+     CSS_URL_RE = %r{url\(\s*['"]?(https?://[^'")\s]+)['"]?\s*\)}i
+     JS_STRING_RE = %r{['"](https?://[a-z0-9][-a-z0-9.]*[a-z0-9][^\s'"]*)['"]}i
+
+     def initialize(base_domain, max_depth: 1)
+       @base_domain = base_domain.to_s
+       @max_depth = max_depth
+       @visited = Set.new
+     end
+
+     def scan_content(content, content_type:)
+       urls = extract_urls(content, content_type)
+       filter_subdomains(urls)
+     end
+
+     def scan_files(directory)
+       found = Set.new
+       Dir.glob(File.join(directory, "**", "*")).each do |path|
+         next unless File.file?(path)
+
+         # invalid:/undef: options only apply when transcoding, so scrub
+         # instead to guarantee valid UTF-8 before the scan regexes run.
+         content = File.read(path, encoding: "UTF-8").scrub("?")
+         ext = File.extname(path).downcase
+         content_type = content_type_for_ext(ext)
+         next unless content_type
+
+         found.merge(scan_content(content, content_type: content_type))
+       end
+       found.to_a
+     end
+
+     def discover_recursive(directory, depth: 0)
+       return [] if depth >= @max_depth
+
+       subdomains = scan_files(directory)
+       new_subdomains = subdomains.reject { |s| @visited.include?(s) }
+       @visited.merge(new_subdomains)
+       new_subdomains
+     end
+
+     def base_domain(host)
+       parts = host.to_s.downcase.split(".")
+       return host.to_s if parts.length <= 2
+
+       MULTI_PART_TLDS.each do |tld|
+         tld_parts = tld.split(".")
+         if parts.last(tld_parts.length) == tld_parts
+           return parts.last(tld_parts.length + 1).join(".")
+         end
+       end
+
+       parts.last(2).join(".")
+     end
+
+     private
+
+     def extract_urls(content, content_type)
+       case content_type
+       when :html then extract_html_urls(content)
+       when :css then extract_css_urls(content)
+       when :js then extract_js_urls(content)
+       else []
+       end
+     end
+
+     def extract_html_urls(content)
+       content.scan(HTML_URL_RE).flatten.map { |h| "https://#{h}" }
+     end
+
+     def extract_css_urls(content)
+       content.scan(CSS_URL_RE).flatten
+     end
+
+     def extract_js_urls(content)
+       content.scan(JS_STRING_RE).flatten
+     end
+
+     def filter_subdomains(urls)
+       base = base_domain(@base_domain)
+       urls.filter_map do |url|
+         host = begin
+           URI.parse(url).host.to_s.downcase
+         rescue URI::InvalidURIError
+           next
+         end
+         next unless host.end_with?(".#{base}") && host != base
+
+         host
+       end.uniq
+     end
+
+     def content_type_for_ext(ext)
+       case ext
+       when ".html", ".htm" then :html
+       when ".css" then :css
+       when ".js" then :js
+       end
+     end
+   end
+ end
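
A sketch with an illustrative domain, showing the multi-part TLD handling and a scan over an HTML fragment:

    discovery = Archaeo::SubdomainDiscovery.new("example.co.uk")

    discovery.base_domain("blog.news.example.co.uk")
    # => "example.co.uk" (co.uk is in MULTI_PART_TLDS, so three labels are kept)

    html = '<a href="https://shop.example.co.uk/sale">Sale</a>'
    discovery.scan_content(html, content_type: :html)
    # => ["shop.example.co.uk"]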