wayback_machine_downloader 0.1.11 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wayback_machine_downloader.rb +1 -1
- data/lib/wayback_machine_downloader/tidy_bytes.rb +122 -0
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d6580398142a2bde95000f07ef0063927b969676
|
4
|
+
data.tar.gz: 029c7e07d34b4c2b3b7d0c85c0567d495c6bd4ed
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c1687ba78e8a13d0553dfbba528f36b5cfdfc089289450720ef10890f072b7d7bc31170810fde87825a4a9f0f9bcee0f6f3e5dbfd835f737c5b6edaecdfcf31d
|
7
|
+
data.tar.gz: 8683d2a3cfdaae90039fdb117f0ed8776fcaad8ea919fed0ac2bbaae98ed6c42403f3488d73afa0e73b3285aa4b5628ba99a7d60a98979f5a2379207013f5ce9
|
@@ -0,0 +1,122 @@
|
|
1
|
+
# Helpers for repairing strings that contain bytes which are not valid UTF-8,
# on the assumption that such bytes are really Windows CP-1252 / ISO-8859-1.
module TibyBytes

  # CP-1252 decimal byte => UTF-8 approximation as an array of bytes.
  # Bytes that have no CP-1252 assignment map to nil and are dropped from the
  # tidied output.
  CP1252 = {
    128 => [226, 130, 172],
    129 => nil,
    130 => [226, 128, 154],
    131 => [198, 146],
    132 => [226, 128, 158],
    133 => [226, 128, 166],
    134 => [226, 128, 160],
    135 => [226, 128, 161],
    136 => [203, 134],
    137 => [226, 128, 176],
    138 => [197, 160],
    139 => [226, 128, 185],
    140 => [197, 146],
    141 => nil,
    142 => [197, 189],
    143 => nil,
    144 => nil,
    145 => [226, 128, 152],
    146 => [226, 128, 153],
    147 => [226, 128, 156],
    148 => [226, 128, 157],
    149 => [226, 128, 162],
    150 => [226, 128, 147],
    151 => [226, 128, 148],
    152 => [203, 156],
    153 => [226, 132, 162],
    154 => [197, 161],
    155 => [226, 128, 186],
    156 => [197, 147],
    157 => nil,
    158 => [197, 190],
    159 => [197, 184]
  }.freeze

  module StringMixin

    # Attempt to replace invalid UTF-8 bytes with valid ones. This method
    # naively assumes if you have invalid UTF8 bytes, they are either Windows
    # CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
    # always work.
    #
    # Passing +true+ will forcibly tidy all bytes, assuming that the string's
    # encoding is CP-1252 or ISO-8859-1. ASCII bytes are kept as they are.
    #
    # Returns a new UTF-8 string. Returns nil when the receiver is empty, or
    # when the cleaned bytes still cannot be decoded as UTF-8.
    def tidy_bytes(force = false)
      if force
        # Every byte is treated as a single CP-1252 / ISO-8859-1 character.
        converted = unpack("C*").map { |b| tidy_byte(b) }
        return converted.flatten.compact.pack("C*").unpack("U*").pack("U*")
      end

      bytes = unpack("C*")
      conts_expected = 0
      last_lead = 0

      # Re-interpret an unfinished multi-byte sequence (its leading byte and
      # the continuation bytes accepted so far) as single-byte characters.
      # While conts_expected > 0 every slot in last_lead...upto still holds
      # the original Integer, so tidy_byte is safe to call on each of them.
      abandon_sequence = lambda do |upto|
        (last_lead...upto).each { |k| bytes[k] = tidy_byte(bytes[k]) }
        conts_expected = 0
      end

      bytes.each_index do |i|
        byte = bytes[i]

        if byte > 240
          # Impossible or highly unlikely byte? Clean it. It also cannot
          # continue a sequence, so give up on any sequence in progress.
          abandon_sequence.call(i) if conts_expected > 0
          bytes[i] = tidy_byte(byte)
        elsif byte > 127 && byte < 192
          # Continuation byte: clean it when none is expected, otherwise
          # expect one less.
          if conts_expected == 0
            bytes[i] = tidy_byte(byte)
          else
            conts_expected -= 1
          end
        else
          # ASCII or leading byte where a continuation was expected? Clean
          # backwards up to the leading byte.
          abandon_sequence.call(i) if conts_expected > 0

          if byte >= 192
            # Leading byte: the number of continuations follows from the
            # position of its first zero bit, with a maximum of 3.
            conts_expected =
              if byte < 224
                1
              elsif byte < 240
                2
              else
                3
              end
            last_lead = i
          end
        end
      end

      # The string ended in the middle of a sequence (this includes a leading
      # byte in final position): clean what is left of it.
      abandon_sequence.call(bytes.length) if conts_expected > 0

      begin
        bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
      rescue ArgumentError
        # unpack("U*") rejected the byte stream as malformed UTF-8.
        nil
      end
    end

    # Tidy bytes in-place.
    #
    # Returns the receiver. The receiver is left untouched when tidy_bytes
    # yields nil (empty or undecodable string) instead of raising TypeError.
    def tidy_bytes!(force = false)
      tidied = tidy_bytes(force)
      tidied ? replace(tidied) : self
    end

    private

    # Map a single byte, read as CP-1252 / ISO-8859-1, to its UTF-8 form.
    # Returns the byte itself for ASCII, an array of UTF-8 bytes otherwise, or
    # nil when the byte has no CP-1252 assignment.
    def tidy_byte(byte)
      if byte < 128
        byte
      elsif byte < 160
        TibyBytes::CP1252[byte]
      elsif byte < 192
        [194, byte]
      else
        [195, byte - 64]
      end
    end

  end
end
|
119
|
+
|
120
|
+
# Make tidy_bytes and tidy_bytes! available on every String instance.
String.send(:include, TibyBytes::StringMixin)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_machine_downloader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.11
|
4
|
+
version: 0.1.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hartator
|
@@ -49,6 +49,7 @@ extra_rdoc_files: []
|
|
49
49
|
files:
|
50
50
|
- bin/wayback_machine_downloader
|
51
51
|
- lib/wayback_machine_downloader.rb
|
52
|
+
- lib/wayback_machine_downloader/tidy_bytes.rb
|
52
53
|
homepage: https://github.com/hartator/wayback-machine-downloader
|
53
54
|
licenses:
|
54
55
|
- MIT
|