archaeo 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 24c6b37575e8f673a8e6acb7aba38264ac811236cb91e905663914d08a283289
4
- data.tar.gz: 632f36d31ee83b23f727dd8eb8217929d3dad26c94c6948fb0621aff2b937701
3
+ metadata.gz: 8b82c7c8f2e568697056587556ac01bee92f51376af390aa26ed9c76221200d8
4
+ data.tar.gz: 411cf04b07219a02e5b049120bb74528dc17cf5fcb8424acd3c7c5ee69659bea
5
5
  SHA512:
6
- metadata.gz: 33cf0ea6c5317be5aafba988e8652555a8e7a77620e93571a51c1537ae4cdd455e76d170fe78549a5e3068ec82db12138159cb87a31c1aafda5c036a6ca2511e
7
- data.tar.gz: 2c0ae7a6a461913ed475dd3084e495f057db995ebf9ea8a1e6c9595aae74cd6bc65a0cdc6f2b2a3a831d9cc0e45e92e06a13a4f7653e89c0a8bbcd7226207677
6
+ metadata.gz: ec87c66817c1ce3f318e45d5b26221ca44c1958dfa9009fbe1541a1b3bbdb8e4585f6baf66e73c6a839a10d938e189b61dbea43afccf74a6f55476ad8719e18e
7
+ data.tar.gz: 0c33454fa368f009a7383ebc8a6a79f3638615740bd312eaab568c21d90cb20a0c4c6615324b20634f00614021016cf6d464910679fcf61fdf8d663097935cd7
@@ -16,6 +16,7 @@ module Archaeo
16
16
  end
17
17
 
18
18
  def near(url, timestamp: nil)
19
+ url = UrlNormalizer.normalize(url)
19
20
  params = { "url" => url }
20
21
  params["timestamp"] = timestamp.to_s if timestamp
21
22
 
@@ -15,6 +15,7 @@ module Archaeo
15
15
  end
16
16
 
17
17
  def download(url, from: nil, to: nil, resume: false)
18
+ url = UrlNormalizer.normalize(url)
18
19
  FileUtils.mkdir_p(@output_dir)
19
20
  state = DownloadState.new(@output_dir)
20
21
 
@@ -47,6 +47,7 @@ module Archaeo
47
47
  # Returns an Enumerator of Snapshot objects, auto-paginating
48
48
  # via resume key unless an explicit page is requested.
49
49
  def snapshots(url, **options)
50
+ url = UrlNormalizer.normalize(url)
50
51
  validate_options!(options)
51
52
 
52
53
  Enumerator.new do |yielder|
@@ -59,9 +60,15 @@ module Archaeo
59
60
  end
60
61
 
61
62
  def near(url, timestamp:)
63
+ url = UrlNormalizer.normalize(url)
62
64
  ts = Timestamp.coerce(timestamp)
63
65
  result = snapshots(url, sort: "closest",
64
66
  closest: ts.to_s, limit: 1).first
67
+ if result&.blocked?
68
+ raise BlockedSiteError,
69
+ "Site is blocked: #{url}"
70
+ end
71
+
65
72
  result || raise(NoSnapshotFound,
66
73
  "No snapshot found near #{ts} for #{url}")
67
74
  end
@@ -94,6 +101,7 @@ module Archaeo
94
101
 
95
102
  # Returns the number of pages for a paginated query.
96
103
  def num_pages(url, **options)
104
+ url = UrlNormalizer.normalize(url)
97
105
  params = { "url" => url, "showNumPages" => "true" }
98
106
  merge_scalar_params!(params, options)
99
107
  response = @client.get(
@@ -109,6 +117,7 @@ module Archaeo
109
117
 
110
118
  # Returns all unique original URLs under a domain.
111
119
  def known_urls(domain, match_type: "domain")
120
+ domain = UrlNormalizer.normalize(domain)
112
121
  snapshots(domain, match_type: match_type,
113
122
  collapse: ["urlkey"]).map(&:original_url).uniq
114
123
  end
@@ -14,6 +14,7 @@ module Archaeo
14
14
  end
15
15
 
16
16
  def fetch(url, timestamp:, identity: false)
17
+ url = UrlNormalizer.normalize(url)
17
18
  ts = Timestamp.coerce(timestamp)
18
19
  archive_url = ArchiveUrl.new(url, timestamp: ts,
19
20
  identity: identity)
@@ -21,6 +22,13 @@ module Archaeo
21
22
  build_page(response, archive_url.to_s, url, ts)
22
23
  end
23
24
 
25
+ def fetch_page_with_assets(url, timestamp:)
26
+ page = fetch(url, timestamp: timestamp)
27
+ assets = AssetExtractor.new(page.content,
28
+ base_url: page.archive_url).extract
29
+ PageBundle.new(page: page, assets: assets)
30
+ end
31
+
24
32
  private
25
33
 
26
34
  def build_page(response, archive_url, url, timestamp)
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Archaeo
4
+ # A fetched page together with all its extracted asset URLs.
5
+ #
6
+ # Bundles a Page with the AssetList discovered from its HTML,
7
+ # providing a single object for complete page archival.
8
+ class PageBundle
9
+ attr_reader :page, :assets
10
+
11
+ def initialize(page:, assets:)
12
+ @page = page
13
+ @assets = assets
14
+ end
15
+ end
16
+ end
@@ -17,6 +17,7 @@ module Archaeo
17
17
  end
18
18
 
19
19
  def save(url)
20
+ url = UrlNormalizer.normalize(url)
20
21
  save_url = "#{ENDPOINT}/#{url}"
21
22
  start_time = Time.now.utc
22
23
  attempt_save(save_url, start_time, url)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Archaeo
4
- VERSION = "0.2.0"
4
+ VERSION = "0.2.1"
5
5
  end
data/lib/archaeo.rb CHANGED
@@ -19,6 +19,7 @@ module Archaeo
19
19
  autoload :ArchiveUrl, "archaeo/archive_url"
20
20
  autoload :Snapshot, "archaeo/snapshot"
21
21
  autoload :Page, "archaeo/page"
22
+ autoload :PageBundle, "archaeo/page_bundle"
22
23
  autoload :SaveResult, "archaeo/save_result"
23
24
  autoload :AvailabilityResult, "archaeo/availability_result"
24
25
  autoload :UrlNormalizer, "archaeo/url_normalizer"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: archaeo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
@@ -84,6 +84,7 @@ files:
84
84
  - lib/archaeo/fetcher.rb
85
85
  - lib/archaeo/http_client.rb
86
86
  - lib/archaeo/page.rb
87
+ - lib/archaeo/page_bundle.rb
87
88
  - lib/archaeo/save_api.rb
88
89
  - lib/archaeo/save_result.rb
89
90
  - lib/archaeo/snapshot.rb