archaeo 0.2.11 → 0.2.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 76a36571f0747712c2abda1a4aef93c7ade9a83b42590e23f0148b89138451b0
4
- data.tar.gz: a9eed4768d084756fbb10eda17b1f2098246fd56a93cbe91b55f693850e5008a
3
+ metadata.gz: 5c83d2255fd16d4af732415985d1dcc48ca84711221e4b893838124252ffc062
4
+ data.tar.gz: '08c5a7e88bfcae8db40e47f7c1aeac2d4930f9354a94d50af2fd9d6da7e5846c'
5
5
  SHA512:
6
- metadata.gz: fa8e01a6aa31aa678a17ce2fc4f59e324c4e8779716b7c41d876dbd366af06dda30296af446919eedc3136efe5bc2527abef60d5aa4274745e94ef7415a775fa
7
- data.tar.gz: b3fd25ec4d3b10c759992226dd2d699276dbd9def9318ef343f632b69faa5c4fb0017f78ae7aa87b1b85c4fb48a642d4d667c3f43e0e768162486d63f1bf7be1
6
+ metadata.gz: 5c264ca7c5cfb256929599426c734927573f6ea8836fd43f1d7524765aa060aee8a7c0fec3a8dd3a9bbc01f8ab441d52428f4245ca66ff02a75c8e8ef1341e38
7
+ data.tar.gz: 35c17304038f91f8bd898a8076efd840faa5a35a46cac17e73c46bc40c44f5a4d04ca94093aef0226be27f53afb4deb5754a9ab50944ec76c80e9cc791c30830
@@ -1,20 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Archaeo
4
- # Verifies that archived snapshots are still accessible.
5
- #
6
- # Checks each snapshot by performing HEAD requests to the
7
- # archive URL and reporting accessibility status.
8
- HealthReport = Struct.new(
9
- :total, :accessible, :missing, :errors, :details,
10
- keyword_init: true
11
- )
12
-
13
- HealthDetail = Struct.new(
14
- :snapshot, :status, :error,
15
- keyword_init: true
16
- )
17
-
18
4
  class ArchiveHealthCheck
19
5
  def initialize(client: HttpClient.new, cdx_api: nil)
20
6
  @client = client
@@ -245,9 +245,6 @@ module Archaeo
245
245
  page = fetch_page(snapshot)
246
246
  validate_page_status(page, snapshot)
247
247
  write_page_file(page, snapshot)
248
- rescue StandardError
249
- FileUtils.rm_f(tmp_path) if defined?(tmp_path)
250
- raise
251
248
  end
252
249
 
253
250
  def fetch_page(snapshot)
@@ -272,6 +269,9 @@ module Archaeo
272
269
  File.binwrite(tmp_path, page.content)
273
270
  File.rename(tmp_path, filename)
274
271
  page.content
272
+ rescue StandardError
273
+ FileUtils.rm_f(tmp_path) if defined?(tmp_path)
274
+ raise
275
275
  end
276
276
 
277
277
  EXTENSION_MAP = {
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Archaeo
4
+ # Builds a CoverageReport from CDX snapshot data.
5
+ class CoverageAnalyzer
6
+ def initialize(cdx_api: nil)
7
+ @cdx_api = cdx_api
8
+ end
9
+
10
+ def analyze(url, from: nil, to: nil)
11
+ cdx = @cdx_api || CdxApi.new
12
+ snapshots = cdx.snapshots(url, from: from, to: to).to_a
13
+
14
+ unique_urls = snapshots.map(&:original_url).uniq
15
+ status_dist = compute_status_distribution(snapshots)
16
+ gaps = compute_temporal_gaps(snapshots)
17
+
18
+ CoverageReport.new(
19
+ url: url,
20
+ total_urls: unique_urls.size,
21
+ archived_urls: snapshots.count(&:success?),
22
+ status_distribution: status_dist,
23
+ temporal_gaps: gaps,
24
+ )
25
+ end
26
+
27
+ private
28
+
29
+ def compute_status_distribution(snapshots)
30
+ snapshots.each_with_object(Hash.new(0)) do |snap, counts|
31
+ counts[snap.status_code] += 1
32
+ end
33
+ end
34
+
35
+ def compute_temporal_gaps(snapshots)
36
+ return [] if snapshots.size < 2
37
+
38
+ sorted = snapshots.sort_by(&:timestamp)
39
+ gaps = []
40
+ sorted.each_cons(2) do |a, b|
41
+ diff_days = (b.timestamp.to_time - a.timestamp.to_time) / 86400
42
+ next unless diff_days > 30
43
+
44
+ gaps << { from: a.timestamp.to_s, to: b.timestamp.to_s,
45
+ gap_days: diff_days.round }
46
+ end
47
+ gaps
48
+ end
49
+ end
50
+ end
@@ -51,51 +51,4 @@ module Archaeo
51
51
  to_h
52
52
  end
53
53
  end
54
-
55
- # Builds a CoverageReport from CDX snapshot data.
56
- class CoverageAnalyzer
57
- def initialize(cdx_api: nil)
58
- @cdx_api = cdx_api
59
- end
60
-
61
- def analyze(url, from: nil, to: nil)
62
- cdx = @cdx_api || CdxApi.new
63
- snapshots = cdx.snapshots(url, from: from, to: to).to_a
64
-
65
- unique_urls = snapshots.map(&:original_url).uniq
66
- status_dist = compute_status_distribution(snapshots)
67
- gaps = compute_temporal_gaps(snapshots)
68
-
69
- CoverageReport.new(
70
- url: url,
71
- total_urls: unique_urls.size,
72
- archived_urls: snapshots.count(&:success?),
73
- status_distribution: status_dist,
74
- temporal_gaps: gaps,
75
- )
76
- end
77
-
78
- private
79
-
80
- def compute_status_distribution(snapshots)
81
- snapshots.each_with_object(Hash.new(0)) do |snap, counts|
82
- counts[snap.status_code] += 1
83
- end
84
- end
85
-
86
- def compute_temporal_gaps(snapshots)
87
- return [] if snapshots.size < 2
88
-
89
- sorted = snapshots.sort_by(&:timestamp)
90
- gaps = []
91
- sorted.each_cons(2) do |a, b|
92
- diff_days = (b.timestamp.to_time - a.timestamp.to_time) / 86400
93
- next unless diff_days > 30
94
-
95
- gaps << { from: a.timestamp.to_s, to: b.timestamp.to_s,
96
- gap_days: diff_days.round }
97
- end
98
- gaps
99
- end
100
- end
101
54
  end
@@ -73,18 +73,15 @@ module Archaeo
73
73
  def stale_entries(base_dir: @output_dir)
74
74
  @mutex.synchronize do
75
75
  entries.reject do |e|
76
- find_file(base_dir,
77
- e["ts"]) && File.exist?(find_file(base_dir, e["ts"]))
76
+ path = find_file(base_dir, e["ts"])
77
+ path && File.exist?(path)
78
78
  end
79
79
  end
80
80
  end
81
81
 
82
82
  def cleanup_stale(base_dir: @output_dir)
83
83
  @mutex.synchronize do
84
- stale = entries.reject do |e|
85
- path = find_file(base_dir, e["ts"])
86
- path && File.exist?(path)
87
- end
84
+ stale = stale_entries(base_dir: base_dir)
88
85
  @entries = entries - stale
89
86
  @entries_key = nil
90
87
  save
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Archaeo
4
+ HealthReport = Struct.new(
5
+ :total, :accessible, :missing, :errors, :details,
6
+ keyword_init: true
7
+ )
8
+
9
+ HealthDetail = Struct.new(
10
+ :snapshot, :status, :error,
11
+ keyword_init: true
12
+ )
13
+ end
data/lib/archaeo/page.rb CHANGED
@@ -59,21 +59,15 @@ module Archaeo
59
59
  end
60
60
 
61
61
  def title
62
- @title ||= begin
63
- doc = Nokogiri::HTML(@raw_content)
64
- doc.at_css("title")&.text&.strip
65
- rescue StandardError
66
- nil
67
- end
62
+ @title ||= html_doc.at_css("title")&.text&.strip
68
63
  end
69
64
 
70
65
  def links
71
66
  return [] unless html?
72
67
 
73
68
  @links ||= begin
74
- doc = Nokogiri::HTML(@raw_content)
75
69
  base = @archive_url || @original_url
76
- doc.css("a[href]").map do |anchor|
70
+ html_doc.css("a[href]").map do |anchor|
77
71
  href = resolve_page_url(anchor["href"], base)
78
72
  { href: href, text: anchor.text.strip,
79
73
  external: href && !href.include?(original_domain) }
@@ -85,9 +79,8 @@ module Archaeo
85
79
  return {} unless html?
86
80
 
87
81
  @meta_tags ||= begin
88
- doc = Nokogiri::HTML(@raw_content)
89
- result = extract_meta_entries(doc)
90
- canonical = doc.at_css('link[rel="canonical"]')
82
+ result = extract_meta_entries(html_doc)
83
+ canonical = html_doc.at_css('link[rel="canonical"]')
91
84
  result["canonical"] = canonical["href"].to_s if canonical
92
85
  result
93
86
  end
@@ -96,46 +89,34 @@ module Archaeo
96
89
  def headings
97
90
  return [] unless html?
98
91
 
99
- @headings ||= begin
100
- doc = Nokogiri::HTML(@raw_content)
101
- doc.css("h1, h2, h3, h4, h5, h6").map do |el|
102
- { level: el.name[1].to_i, text: el.text.strip }
103
- end
92
+ @headings ||= html_doc.css("h1, h2, h3, h4, h5, h6").map do |el|
93
+ { level: el.name[1].to_i, text: el.text.strip }
104
94
  end
105
95
  end
106
96
 
107
97
  def images
108
98
  return [] unless html?
109
99
 
110
- @images ||= begin
111
- doc = Nokogiri::HTML(@raw_content)
112
- doc.css("img[src]").map do |el|
113
- { src: el["src"], alt: el["alt"].to_s,
114
- width: el["width"]&.to_i, height: el["height"]&.to_i }
115
- end
100
+ @images ||= html_doc.css("img[src]").map do |el|
101
+ { src: el["src"], alt: el["alt"].to_s,
102
+ width: el["width"]&.to_i, height: el["height"]&.to_i }
116
103
  end
117
104
  end
118
105
 
119
106
  def forms
120
107
  return [] unless html?
121
108
 
122
- @forms ||= begin
123
- doc = Nokogiri::HTML(@raw_content)
124
- doc.css("form").map do |form|
125
- { action: form["action"].to_s, method: (form["method"] || "GET").upcase,
126
- fields: extract_form_fields(form) }
127
- end
109
+ @forms ||= html_doc.css("form").map do |form|
110
+ { action: form["action"].to_s, method: (form["method"] || "GET").upcase,
111
+ fields: extract_form_fields(form) }
128
112
  end
129
113
  end
130
114
 
131
115
  def scripts
132
116
  return [] unless html?
133
117
 
134
- @scripts ||= begin
135
- doc = Nokogiri::HTML(@raw_content)
136
- doc.css("script").map do |el|
137
- { src: el["src"].to_s, type: el["type"].to_s }
138
- end
118
+ @scripts ||= html_doc.css("script").map do |el|
119
+ { src: el["src"].to_s, type: el["type"].to_s }
139
120
  end
140
121
  end
141
122
 
@@ -143,8 +124,7 @@ module Archaeo
143
124
  return [] unless html?
144
125
 
145
126
  @microposts ||= begin
146
- doc = Nokogiri::HTML(@raw_content)
147
- containers = find_article_containers(doc)
127
+ containers = find_article_containers(html_doc)
148
128
  containers.filter_map { |el| extract_micropost(el) }
149
129
  end
150
130
  end
@@ -162,15 +142,7 @@ module Archaeo
162
142
  end
163
143
 
164
144
  def as_json(*)
165
- {
166
- content_type: @content_type,
167
- status_code: @status_code,
168
- archive_url: @archive_url,
169
- original_url: @original_url,
170
- timestamp: @timestamp.to_s,
171
- size: size,
172
- encoding: encoding.to_s,
173
- }
145
+ to_h.transform_values { |v| v.is_a?(Timestamp) ? v.to_s : v }
174
146
  end
175
147
 
176
148
  def inspect
@@ -179,6 +151,10 @@ module Archaeo
179
151
 
180
152
  private
181
153
 
154
+ def html_doc
155
+ @html_doc ||= Nokogiri::HTML(@raw_content)
156
+ end
157
+
182
158
  def detect_encoding
183
159
  charset = extract_charset(@content_type)
184
160
  return Encoding.find(charset) if charset
@@ -199,11 +175,10 @@ module Archaeo
199
175
  end
200
176
 
201
177
  def detect_html_charset
202
- doc = Nokogiri::HTML(@raw_content)
203
- node = doc.at_css("meta[charset]")
178
+ node = html_doc.at_css("meta[charset]")
204
179
  return node["charset"] if node
205
180
 
206
- content = doc.at_css('meta[http-equiv="Content-Type"]')&.[]("content")
181
+ content = html_doc.at_css('meta[http-equiv="Content-Type"]')&.[]("content")
207
182
  return nil unless content
208
183
 
209
184
  match = content.match(/charset=([^\s;]+)/i)
@@ -250,8 +225,8 @@ module Archaeo
250
225
 
251
226
  def resolve_page_url(href, base)
252
227
  return href unless href
253
- return href if href.start_with?("http", "//", "data:", "#",
254
- "javascript:")
228
+ return href if href.start_with?("http:", "https:", "//", "data:",
229
+ "#", "javascript:")
255
230
  return nil unless base
256
231
 
257
232
  URI.join(base, href).to_s
@@ -104,7 +104,6 @@ module Archaeo
104
104
  end
105
105
 
106
106
  def count_elements(page)
107
- require "nokogiri"
108
107
  doc = Nokogiri::HTML(page.content)
109
108
  counts = Hash.new(0)
110
109
  doc.css("*").each { |el| counts[el.name] += 1 }
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Archaeo
4
- VERSION = "0.2.11"
4
+ VERSION = "0.2.12"
5
5
  end
data/lib/archaeo.rb CHANGED
@@ -71,4 +71,7 @@ module Archaeo
71
71
  autoload :SearchResult, "archaeo/archive_search"
72
72
  autoload :LocalRewriter, "archaeo/local_rewriter"
73
73
  autoload :LocalRewriteSummary, "archaeo/local_rewriter"
74
+ autoload :CoverageAnalyzer, "archaeo/coverage_analyzer"
75
+ autoload :HealthReport, "archaeo/health_report"
76
+ autoload :HealthDetail, "archaeo/health_report"
74
77
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: archaeo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.11
4
+ version: 0.2.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
@@ -87,11 +87,13 @@ files:
87
87
  - lib/archaeo/color_output.rb
88
88
  - lib/archaeo/configuration.rb
89
89
  - lib/archaeo/content_tracker.rb
90
+ - lib/archaeo/coverage_analyzer.rb
90
91
  - lib/archaeo/coverage_report.rb
91
92
  - lib/archaeo/download_scheduler.rb
92
93
  - lib/archaeo/download_state.rb
93
94
  - lib/archaeo/encoding_detector.rb
94
95
  - lib/archaeo/fetcher.rb
96
+ - lib/archaeo/health_report.rb
95
97
  - lib/archaeo/http_client.rb
96
98
  - lib/archaeo/local_rewriter.rb
97
99
  - lib/archaeo/page.rb