wayback_machine_downloader 0.1.11 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6f92d7e391faeb42397b9977342485cb66f63fda
4
- data.tar.gz: a9c72cc1304cd28226f6fcd111cba6398ab3f5b0
3
+ metadata.gz: d6580398142a2bde95000f07ef0063927b969676
4
+ data.tar.gz: 029c7e07d34b4c2b3b7d0c85c0567d495c6bd4ed
5
5
  SHA512:
6
- metadata.gz: 700e7774978a1b2f02eac2485126fe9e8f900a741a45c1d30ab6a89c890029c7d04ec59938147181392a24b663c43cbdd878865a553dfbd5a430abaaa2d8899a
7
- data.tar.gz: 17499b71152c831974e0c3d5afb737d3a251e1d5d446775cccfc0179a618a98f49b7aadf570b22fe628f6b83c6b0d87100d71df2f778d47c4b5c1a58145f787d
6
+ metadata.gz: c1687ba78e8a13d0553dfbba528f36b5cfdfc089289450720ef10890f072b7d7bc31170810fde87825a4a9f0f9bcee0f6f3e5dbfd835f737c5b6edaecdfcf31d
7
+ data.tar.gz: 8683d2a3cfdaae90039fdb117f0ed8776fcaad8ea919fed0ac2bbaae98ed6c42403f3488d73afa0e73b3285aa4b5628ba99a7d60a98979f5a2379207013f5ce9
@@ -4,7 +4,7 @@ require_relative 'wayback_machine_downloader/tidy_bytes'
4
4
 
5
5
  class WaybackMachineDownloader
6
6
 
7
- VERSION = "0.1.11"
7
+ VERSION = "0.1.12"
8
8
 
9
9
  attr_accessor :base_url, :timestamp
10
10
 
@@ -0,0 +1,122 @@
1
# Utilities for repairing strings that contain bytes which are not valid
# UTF-8, on the assumption that such bytes were originally Windows CP-1252 or
# ISO-8859-1 text.
#
# NOTE: the module name is spelled "TibyBytes" (sic). It is kept as-is because
# it is the public constant other code refers to.
module TibyBytes

  # CP-1252 decimal byte => UTF-8 approximation as an array of bytes.
  # Bytes mapped to +nil+ have no replacement in this table; they are dropped
  # from the output (see the +compact+ calls in StringMixin#tidy_bytes).
  CP1252 = {
    128 => [226, 130, 172],
    129 => nil,
    130 => [226, 128, 154],
    131 => [198, 146],
    132 => [226, 128, 158],
    133 => [226, 128, 166],
    134 => [226, 128, 160],
    135 => [226, 128, 161],
    136 => [203, 134],
    137 => [226, 128, 176],
    138 => [197, 160],
    139 => [226, 128, 185],
    140 => [197, 146],
    141 => nil,
    142 => [197, 189],
    143 => nil,
    144 => nil,
    145 => [226, 128, 152],
    146 => [226, 128, 153],
    147 => [226, 128, 156],
    148 => [226, 128, 157],
    149 => [226, 128, 162],
    150 => [226, 128, 147],
    151 => [226, 128, 148],
    152 => [203, 156],
    153 => [226, 132, 162],
    154 => [197, 161],
    155 => [226, 128, 186],
    156 => [197, 147],
    157 => nil,
    158 => [197, 190],
    159 => [197, 184]
  }.freeze

  module StringMixin

    # Attempt to replace invalid UTF-8 bytes with valid ones. This method
    # naively assumes if you have invalid UTF-8 bytes, they are either Windows
    # CP-1252 or ISO-8859-1. In practice this isn't a bad assumption, but may
    # not always work.
    #
    # Passing +true+ will forcibly tidy all bytes, assuming that the string's
    # encoding is CP-1252 or ISO-8859-1. ASCII bytes are kept unchanged.
    #
    # @param force [Boolean] convert every byte instead of only invalid ones
    # @return [String, nil] a new UTF-8 string; +nil+ when the receiver is
    #   empty (non-forced mode only) or when the cleaned bytes still cannot be
    #   decoded as UTF-8
    def tidy_bytes(force = false)
      if force
        return unpack("C*").map { |b| tidy_byte(b) }.flatten.compact.pack("C*").unpack("U*").pack("U*")
      end

      bytes = unpack("C*")
      return nil if bytes.empty?

      conts_expected = 0
      last_lead = 0

      bytes.each_index do |i|
        byte = bytes[i]
        is_cont = byte > 127 && byte < 192
        is_lead = byte > 191 && byte < 241
        # Bytes above 240 are treated as impossible or highly unlikely.
        is_invalid = byte > 240

        # An expected continuation byte: one less to wait for.
        if is_cont && conts_expected > 0
          conts_expected -= 1
          next
        end

        # Anything else interrupts a pending multi-byte sequence: clean
        # backwards up to (and including) its leading byte. Only the lead and
        # its accepted continuation bytes can be in that range, so every entry
        # is still a plain integer.
        if conts_expected > 0
          (last_lead...i).each { |k| bytes[k] = tidy_byte(bytes[k]) }
          conts_expected = 0
        end

        if is_cont || is_invalid
          # Unexpected continuation byte or unlikely byte? Clean it.
          bytes[i] = tidy_byte(byte)
        elsif is_lead
          # Leading byte: expect continuations determined by the position of
          # the first zero bit, with a max of 3.
          conts_expected = if byte < 224
            1
          elsif byte < 240
            2
          else
            3
          end
          last_lead = i
        end
      end

      # The string ended in the middle of a multi-byte sequence (this includes
      # a leading byte in final position): clean the unfinished tail.
      if conts_expected > 0
        (last_lead...bytes.length).each { |k| bytes[k] = tidy_byte(bytes[k]) }
      end

      begin
        bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
      rescue ArgumentError
        # unpack("U*") rejected the byte sequence; report failure as nil.
        nil
      end
    end

    # Tidy bytes in-place.
    #
    # When #tidy_bytes yields +nil+ (empty receiver, or bytes that could not
    # be repaired) the receiver is left untouched instead of raising.
    #
    # @param force [Boolean] see #tidy_bytes
    # @return [String] the receiver
    def tidy_bytes!(force = false)
      tidied = tidy_bytes(force)
      tidied.nil? ? self : replace(tidied)
    end

    private

    # Map a single byte to its UTF-8 replacement.
    #
    # @param byte [Integer] byte value, 0..255
    # @return [Integer, Array<Integer>, nil] the byte itself for ASCII, the
    #   CP1252 table entry for 128..159 (possibly +nil+), otherwise a two-byte
    #   UTF-8 sequence
    def tidy_byte(byte)
      if byte < 128
        byte
      elsif byte < 160
        TibyBytes::CP1252[byte]
      elsif byte < 192
        [194, byte]
      else
        [195, byte - 64]
      end
    end

  end
end
119
+
120
# Mix the byte-tidying helpers (tidy_bytes / tidy_bytes!) into every String.
String.send(:include, TibyBytes::StringMixin)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.11
4
+ version: 0.1.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator
@@ -49,6 +49,7 @@ extra_rdoc_files: []
49
49
  files:
50
50
  - bin/wayback_machine_downloader
51
51
  - lib/wayback_machine_downloader.rb
52
+ - lib/wayback_machine_downloader/tidy_bytes.rb
52
53
  homepage: https://github.com/hartator/wayback-machine-downloader
53
54
  licenses:
54
55
  - MIT