archaeo 0.2.11 → 0.2.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/archaeo/archive_health_check.rb +0 -14
- data/lib/archaeo/bulk_downloader.rb +3 -3
- data/lib/archaeo/coverage_analyzer.rb +50 -0
- data/lib/archaeo/coverage_report.rb +0 -47
- data/lib/archaeo/download_state.rb +3 -6
- data/lib/archaeo/health_report.rb +13 -0
- data/lib/archaeo/page.rb +24 -49
- data/lib/archaeo/snapshot_diff.rb +0 -1
- data/lib/archaeo/version.rb +1 -1
- data/lib/archaeo.rb +3 -0
- metadata +3 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 5c83d2255fd16d4af732415985d1dcc48ca84711221e4b893838124252ffc062
|
|
4
|
+
data.tar.gz: '08c5a7e88bfcae8db40e47f7c1aeac2d4930f9354a94d50af2fd9d6da7e5846c'
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 5c264ca7c5cfb256929599426c734927573f6ea8836fd43f1d7524765aa060aee8a7c0fec3a8dd3a9bbc01f8ab441d52428f4245ca66ff02a75c8e8ef1341e38
|
|
7
|
+
data.tar.gz: 35c17304038f91f8bd898a8076efd840faa5a35a46cac17e73c46bc40c44f5a4d04ca94093aef0226be27f53afb4deb5754a9ab50944ec76c80e9cc791c30830
|
|
@@ -1,20 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Archaeo
|
|
4
|
-
# Verifies that archived snapshots are still accessible.
|
|
5
|
-
#
|
|
6
|
-
# Checks each snapshot by performing HEAD requests to the
|
|
7
|
-
# archive URL and reporting accessibility status.
|
|
8
|
-
HealthReport = Struct.new(
|
|
9
|
-
:total, :accessible, :missing, :errors, :details,
|
|
10
|
-
keyword_init: true
|
|
11
|
-
)
|
|
12
|
-
|
|
13
|
-
HealthDetail = Struct.new(
|
|
14
|
-
:snapshot, :status, :error,
|
|
15
|
-
keyword_init: true
|
|
16
|
-
)
|
|
17
|
-
|
|
18
4
|
class ArchiveHealthCheck
|
|
19
5
|
def initialize(client: HttpClient.new, cdx_api: nil)
|
|
20
6
|
@client = client
|
|
@@ -245,9 +245,6 @@ module Archaeo
|
|
|
245
245
|
page = fetch_page(snapshot)
|
|
246
246
|
validate_page_status(page, snapshot)
|
|
247
247
|
write_page_file(page, snapshot)
|
|
248
|
-
rescue StandardError
|
|
249
|
-
FileUtils.rm_f(tmp_path) if defined?(tmp_path)
|
|
250
|
-
raise
|
|
251
248
|
end
|
|
252
249
|
|
|
253
250
|
def fetch_page(snapshot)
|
|
@@ -272,6 +269,9 @@ module Archaeo
|
|
|
272
269
|
File.binwrite(tmp_path, page.content)
|
|
273
270
|
File.rename(tmp_path, filename)
|
|
274
271
|
page.content
|
|
272
|
+
rescue StandardError
|
|
273
|
+
FileUtils.rm_f(tmp_path) if defined?(tmp_path)
|
|
274
|
+
raise
|
|
275
275
|
end
|
|
276
276
|
|
|
277
277
|
EXTENSION_MAP = {
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Archaeo
|
|
4
|
+
# Builds a CoverageReport from CDX snapshot data.
|
|
5
|
+
class CoverageAnalyzer
|
|
6
|
+
def initialize(cdx_api: nil)
|
|
7
|
+
@cdx_api = cdx_api
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
def analyze(url, from: nil, to: nil)
|
|
11
|
+
cdx = @cdx_api || CdxApi.new
|
|
12
|
+
snapshots = cdx.snapshots(url, from: from, to: to).to_a
|
|
13
|
+
|
|
14
|
+
unique_urls = snapshots.map(&:original_url).uniq
|
|
15
|
+
status_dist = compute_status_distribution(snapshots)
|
|
16
|
+
gaps = compute_temporal_gaps(snapshots)
|
|
17
|
+
|
|
18
|
+
CoverageReport.new(
|
|
19
|
+
url: url,
|
|
20
|
+
total_urls: unique_urls.size,
|
|
21
|
+
archived_urls: snapshots.count(&:success?),
|
|
22
|
+
status_distribution: status_dist,
|
|
23
|
+
temporal_gaps: gaps,
|
|
24
|
+
)
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
private
|
|
28
|
+
|
|
29
|
+
def compute_status_distribution(snapshots)
|
|
30
|
+
snapshots.each_with_object(Hash.new(0)) do |snap, counts|
|
|
31
|
+
counts[snap.status_code] += 1
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
def compute_temporal_gaps(snapshots)
|
|
36
|
+
return [] if snapshots.size < 2
|
|
37
|
+
|
|
38
|
+
sorted = snapshots.sort_by(&:timestamp)
|
|
39
|
+
gaps = []
|
|
40
|
+
sorted.each_cons(2) do |a, b|
|
|
41
|
+
diff_days = (b.timestamp.to_time - a.timestamp.to_time) / 86400
|
|
42
|
+
next unless diff_days > 30
|
|
43
|
+
|
|
44
|
+
gaps << { from: a.timestamp.to_s, to: b.timestamp.to_s,
|
|
45
|
+
gap_days: diff_days.round }
|
|
46
|
+
end
|
|
47
|
+
gaps
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
end
|
|
@@ -51,51 +51,4 @@ module Archaeo
|
|
|
51
51
|
to_h
|
|
52
52
|
end
|
|
53
53
|
end
|
|
54
|
-
|
|
55
|
-
# Builds a CoverageReport from CDX snapshot data.
|
|
56
|
-
class CoverageAnalyzer
|
|
57
|
-
def initialize(cdx_api: nil)
|
|
58
|
-
@cdx_api = cdx_api
|
|
59
|
-
end
|
|
60
|
-
|
|
61
|
-
def analyze(url, from: nil, to: nil)
|
|
62
|
-
cdx = @cdx_api || CdxApi.new
|
|
63
|
-
snapshots = cdx.snapshots(url, from: from, to: to).to_a
|
|
64
|
-
|
|
65
|
-
unique_urls = snapshots.map(&:original_url).uniq
|
|
66
|
-
status_dist = compute_status_distribution(snapshots)
|
|
67
|
-
gaps = compute_temporal_gaps(snapshots)
|
|
68
|
-
|
|
69
|
-
CoverageReport.new(
|
|
70
|
-
url: url,
|
|
71
|
-
total_urls: unique_urls.size,
|
|
72
|
-
archived_urls: snapshots.count(&:success?),
|
|
73
|
-
status_distribution: status_dist,
|
|
74
|
-
temporal_gaps: gaps,
|
|
75
|
-
)
|
|
76
|
-
end
|
|
77
|
-
|
|
78
|
-
private
|
|
79
|
-
|
|
80
|
-
def compute_status_distribution(snapshots)
|
|
81
|
-
snapshots.each_with_object(Hash.new(0)) do |snap, counts|
|
|
82
|
-
counts[snap.status_code] += 1
|
|
83
|
-
end
|
|
84
|
-
end
|
|
85
|
-
|
|
86
|
-
def compute_temporal_gaps(snapshots)
|
|
87
|
-
return [] if snapshots.size < 2
|
|
88
|
-
|
|
89
|
-
sorted = snapshots.sort_by(&:timestamp)
|
|
90
|
-
gaps = []
|
|
91
|
-
sorted.each_cons(2) do |a, b|
|
|
92
|
-
diff_days = (b.timestamp.to_time - a.timestamp.to_time) / 86400
|
|
93
|
-
next unless diff_days > 30
|
|
94
|
-
|
|
95
|
-
gaps << { from: a.timestamp.to_s, to: b.timestamp.to_s,
|
|
96
|
-
gap_days: diff_days.round }
|
|
97
|
-
end
|
|
98
|
-
gaps
|
|
99
|
-
end
|
|
100
|
-
end
|
|
101
54
|
end
|
|
@@ -73,18 +73,15 @@ module Archaeo
|
|
|
73
73
|
def stale_entries(base_dir: @output_dir)
|
|
74
74
|
@mutex.synchronize do
|
|
75
75
|
entries.reject do |e|
|
|
76
|
-
find_file(base_dir,
|
|
77
|
-
|
|
76
|
+
path = find_file(base_dir, e["ts"])
|
|
77
|
+
path && File.exist?(path)
|
|
78
78
|
end
|
|
79
79
|
end
|
|
80
80
|
end
|
|
81
81
|
|
|
82
82
|
def cleanup_stale(base_dir: @output_dir)
|
|
83
83
|
@mutex.synchronize do
|
|
84
|
-
stale =
|
|
85
|
-
path = find_file(base_dir, e["ts"])
|
|
86
|
-
path && File.exist?(path)
|
|
87
|
-
end
|
|
84
|
+
stale = stale_entries(base_dir: base_dir)
|
|
88
85
|
@entries = entries - stale
|
|
89
86
|
@entries_key = nil
|
|
90
87
|
save
|
data/lib/archaeo/page.rb
CHANGED
|
@@ -59,21 +59,15 @@ module Archaeo
|
|
|
59
59
|
end
|
|
60
60
|
|
|
61
61
|
def title
|
|
62
|
-
@title ||=
|
|
63
|
-
doc = Nokogiri::HTML(@raw_content)
|
|
64
|
-
doc.at_css("title")&.text&.strip
|
|
65
|
-
rescue StandardError
|
|
66
|
-
nil
|
|
67
|
-
end
|
|
62
|
+
@title ||= html_doc.at_css("title")&.text&.strip
|
|
68
63
|
end
|
|
69
64
|
|
|
70
65
|
def links
|
|
71
66
|
return [] unless html?
|
|
72
67
|
|
|
73
68
|
@links ||= begin
|
|
74
|
-
doc = Nokogiri::HTML(@raw_content)
|
|
75
69
|
base = @archive_url || @original_url
|
|
76
|
-
|
|
70
|
+
html_doc.css("a[href]").map do |anchor|
|
|
77
71
|
href = resolve_page_url(anchor["href"], base)
|
|
78
72
|
{ href: href, text: anchor.text.strip,
|
|
79
73
|
external: href && !href.include?(original_domain) }
|
|
@@ -85,9 +79,8 @@ module Archaeo
|
|
|
85
79
|
return {} unless html?
|
|
86
80
|
|
|
87
81
|
@meta_tags ||= begin
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
canonical = doc.at_css('link[rel="canonical"]')
|
|
82
|
+
result = extract_meta_entries(html_doc)
|
|
83
|
+
canonical = html_doc.at_css('link[rel="canonical"]')
|
|
91
84
|
result["canonical"] = canonical["href"].to_s if canonical
|
|
92
85
|
result
|
|
93
86
|
end
|
|
@@ -96,46 +89,34 @@ module Archaeo
|
|
|
96
89
|
def headings
|
|
97
90
|
return [] unless html?
|
|
98
91
|
|
|
99
|
-
@headings ||=
|
|
100
|
-
|
|
101
|
-
doc.css("h1, h2, h3, h4, h5, h6").map do |el|
|
|
102
|
-
{ level: el.name[1].to_i, text: el.text.strip }
|
|
103
|
-
end
|
|
92
|
+
@headings ||= html_doc.css("h1, h2, h3, h4, h5, h6").map do |el|
|
|
93
|
+
{ level: el.name[1].to_i, text: el.text.strip }
|
|
104
94
|
end
|
|
105
95
|
end
|
|
106
96
|
|
|
107
97
|
def images
|
|
108
98
|
return [] unless html?
|
|
109
99
|
|
|
110
|
-
@images ||=
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
{ src: el["src"], alt: el["alt"].to_s,
|
|
114
|
-
width: el["width"]&.to_i, height: el["height"]&.to_i }
|
|
115
|
-
end
|
|
100
|
+
@images ||= html_doc.css("img[src]").map do |el|
|
|
101
|
+
{ src: el["src"], alt: el["alt"].to_s,
|
|
102
|
+
width: el["width"]&.to_i, height: el["height"]&.to_i }
|
|
116
103
|
end
|
|
117
104
|
end
|
|
118
105
|
|
|
119
106
|
def forms
|
|
120
107
|
return [] unless html?
|
|
121
108
|
|
|
122
|
-
@forms ||=
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
{ action: form["action"].to_s, method: (form["method"] || "GET").upcase,
|
|
126
|
-
fields: extract_form_fields(form) }
|
|
127
|
-
end
|
|
109
|
+
@forms ||= html_doc.css("form").map do |form|
|
|
110
|
+
{ action: form["action"].to_s, method: (form["method"] || "GET").upcase,
|
|
111
|
+
fields: extract_form_fields(form) }
|
|
128
112
|
end
|
|
129
113
|
end
|
|
130
114
|
|
|
131
115
|
def scripts
|
|
132
116
|
return [] unless html?
|
|
133
117
|
|
|
134
|
-
@scripts ||=
|
|
135
|
-
|
|
136
|
-
doc.css("script").map do |el|
|
|
137
|
-
{ src: el["src"].to_s, type: el["type"].to_s }
|
|
138
|
-
end
|
|
118
|
+
@scripts ||= html_doc.css("script").map do |el|
|
|
119
|
+
{ src: el["src"].to_s, type: el["type"].to_s }
|
|
139
120
|
end
|
|
140
121
|
end
|
|
141
122
|
|
|
@@ -143,8 +124,7 @@ module Archaeo
|
|
|
143
124
|
return [] unless html?
|
|
144
125
|
|
|
145
126
|
@microposts ||= begin
|
|
146
|
-
|
|
147
|
-
containers = find_article_containers(doc)
|
|
127
|
+
containers = find_article_containers(html_doc)
|
|
148
128
|
containers.filter_map { |el| extract_micropost(el) }
|
|
149
129
|
end
|
|
150
130
|
end
|
|
@@ -162,15 +142,7 @@ module Archaeo
|
|
|
162
142
|
end
|
|
163
143
|
|
|
164
144
|
def as_json(*)
|
|
165
|
-
{
|
|
166
|
-
content_type: @content_type,
|
|
167
|
-
status_code: @status_code,
|
|
168
|
-
archive_url: @archive_url,
|
|
169
|
-
original_url: @original_url,
|
|
170
|
-
timestamp: @timestamp.to_s,
|
|
171
|
-
size: size,
|
|
172
|
-
encoding: encoding.to_s,
|
|
173
|
-
}
|
|
145
|
+
to_h.transform_values { |v| v.is_a?(Timestamp) ? v.to_s : v }
|
|
174
146
|
end
|
|
175
147
|
|
|
176
148
|
def inspect
|
|
@@ -179,6 +151,10 @@ module Archaeo
|
|
|
179
151
|
|
|
180
152
|
private
|
|
181
153
|
|
|
154
|
+
def html_doc
|
|
155
|
+
@html_doc ||= Nokogiri::HTML(@raw_content)
|
|
156
|
+
end
|
|
157
|
+
|
|
182
158
|
def detect_encoding
|
|
183
159
|
charset = extract_charset(@content_type)
|
|
184
160
|
return Encoding.find(charset) if charset
|
|
@@ -199,11 +175,10 @@ module Archaeo
|
|
|
199
175
|
end
|
|
200
176
|
|
|
201
177
|
def detect_html_charset
|
|
202
|
-
|
|
203
|
-
node = doc.at_css("meta[charset]")
|
|
178
|
+
node = html_doc.at_css("meta[charset]")
|
|
204
179
|
return node["charset"] if node
|
|
205
180
|
|
|
206
|
-
content =
|
|
181
|
+
content = html_doc.at_css('meta[http-equiv="Content-Type"]')&.[]("content")
|
|
207
182
|
return nil unless content
|
|
208
183
|
|
|
209
184
|
match = content.match(/charset=([^\s;]+)/i)
|
|
@@ -250,8 +225,8 @@ module Archaeo
|
|
|
250
225
|
|
|
251
226
|
def resolve_page_url(href, base)
|
|
252
227
|
return href unless href
|
|
253
|
-
return href if href.start_with?("http", "
|
|
254
|
-
"javascript:")
|
|
228
|
+
return href if href.start_with?("http:", "https:", "//", "data:",
|
|
229
|
+
"#", "javascript:")
|
|
255
230
|
return nil unless base
|
|
256
231
|
|
|
257
232
|
URI.join(base, href).to_s
|
data/lib/archaeo/version.rb
CHANGED
data/lib/archaeo.rb
CHANGED
|
@@ -71,4 +71,7 @@ module Archaeo
|
|
|
71
71
|
autoload :SearchResult, "archaeo/archive_search"
|
|
72
72
|
autoload :LocalRewriter, "archaeo/local_rewriter"
|
|
73
73
|
autoload :LocalRewriteSummary, "archaeo/local_rewriter"
|
|
74
|
+
autoload :CoverageAnalyzer, "archaeo/coverage_analyzer"
|
|
75
|
+
autoload :HealthReport, "archaeo/health_report"
|
|
76
|
+
autoload :HealthDetail, "archaeo/health_report"
|
|
74
77
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: archaeo
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.11
|
|
4
|
+
version: 0.2.12
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
@@ -87,11 +87,13 @@ files:
|
|
|
87
87
|
- lib/archaeo/color_output.rb
|
|
88
88
|
- lib/archaeo/configuration.rb
|
|
89
89
|
- lib/archaeo/content_tracker.rb
|
|
90
|
+
- lib/archaeo/coverage_analyzer.rb
|
|
90
91
|
- lib/archaeo/coverage_report.rb
|
|
91
92
|
- lib/archaeo/download_scheduler.rb
|
|
92
93
|
- lib/archaeo/download_state.rb
|
|
93
94
|
- lib/archaeo/encoding_detector.rb
|
|
94
95
|
- lib/archaeo/fetcher.rb
|
|
96
|
+
- lib/archaeo/health_report.rb
|
|
95
97
|
- lib/archaeo/http_client.rb
|
|
96
98
|
- lib/archaeo/local_rewriter.rb
|
|
97
99
|
- lib/archaeo/page.rb
|