wayback_machine_downloader 0.1.11 → 0.1.12
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/wayback_machine_downloader.rb +1 -1
- data/lib/wayback_machine_downloader/tidy_bytes.rb +122 -0
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d6580398142a2bde95000f07ef0063927b969676
|
4
|
+
data.tar.gz: 029c7e07d34b4c2b3b7d0c85c0567d495c6bd4ed
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c1687ba78e8a13d0553dfbba528f36b5cfdfc089289450720ef10890f072b7d7bc31170810fde87825a4a9f0f9bcee0f6f3e5dbfd835f737c5b6edaecdfcf31d
|
7
|
+
data.tar.gz: 8683d2a3cfdaae90039fdb117f0ed8776fcaad8ea919fed0ac2bbaae98ed6c42403f3488d73afa0e73b3285aa4b5628ba99a7d60a98979f5a2379207013f5ce9
|
@@ -0,0 +1,122 @@
|
|
1
|
+
# Repairs strings that contain invalid UTF-8 by reinterpreting the offending
# bytes as Windows CP-1252 / ISO-8859-1 and re-encoding them as UTF-8.
#
# NOTE: the module name is spelled "TibyBytes" (sic); it is kept as-is because
# it is the name other code refers to.
module TibyBytes

  # CP-1252 decimal byte => UTF-8 approximation as an array of bytes.
  # A +nil+ value means the table has no replacement for that byte; such
  # bytes are removed from the output (see the +compact+ calls below).
  CP1252 = {
    128 => [226, 130, 172],
    129 => nil,
    130 => [226, 128, 154],
    131 => [198, 146],
    132 => [226, 128, 158],
    133 => [226, 128, 166],
    134 => [226, 128, 160],
    135 => [226, 128, 161],
    136 => [203, 134],
    137 => [226, 128, 176],
    138 => [197, 160],
    139 => [226, 128, 185],
    140 => [197, 146],
    141 => nil,
    142 => [197, 189],
    143 => nil,
    144 => nil,
    145 => [226, 128, 152],
    146 => [226, 128, 153],
    147 => [226, 128, 156],
    148 => [226, 128, 157],
    149 => [226, 128, 162],
    150 => [226, 128, 147],
    151 => [226, 128, 148],
    152 => [203, 156],
    153 => [226, 132, 162],
    154 => [197, 161],
    155 => [226, 128, 186],
    156 => [197, 147],
    157 => nil,
    158 => [197, 190],
    159 => [197, 184]
  }.freeze

  module StringMixin

    # Attempt to replace invalid UTF-8 bytes with valid ones. This method
    # naively assumes if you have invalid UTF8 bytes, they are either Windows
    # CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
    # always work.
    #
    # Passing +true+ will forcibly tidy all bytes, assuming that the string's
    # encoding is CP-1252 or ISO-8859-1.
    #
    # Returns a new UTF-8 String. Without +force+, returns +nil+ when the
    # receiver is empty or when the cleaned bytes still cannot be decoded as
    # UTF-8 (the ArgumentError raised by <tt>unpack("U*")</tt> is swallowed).
    def tidy_bytes(force = false)
      if force
        forced = unpack("C*").map { |b| tidy_byte(b) }
        return forced.flatten.compact.pack("C*").unpack("U*").pack("U*")
      end

      bytes = unpack("C*")
      conts_expected = 0
      last_lead = 0

      bytes.each_index do |i|
        byte = bytes[i]
        is_cont = byte > 127 && byte < 192
        is_lead = byte > 191 && byte < 245
        # Covers both the "unused" (> 240) and "restricted" (> 244) ranges.
        # NOTE(review): this check runs before +is_lead+ is consulted, so
        # bytes 241..244 are cleaned as single bytes and never start a
        # sequence -- presumably inherited behaviour; confirm before changing.
        is_unused_or_restricted = byte > 240

        # Impossible or highly unlikely byte? Clean it.
        if is_unused_or_restricted
          bytes[i] = tidy_byte(byte)
        elsif is_cont
          # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
          if conts_expected == 0
            bytes[i] = tidy_byte(byte)
          else
            conts_expected -= 1
          end
        else
          if conts_expected > 0
            # Expected continuation, but got ASCII or leading? Clean backwards up to
            # the leading byte. Entries already replaced by an Array (or nil)
            # earlier in this pass are skipped instead of raising NoMethodError.
            (last_lead...i).each do |k|
              bytes[k] = tidy_byte(bytes[k]) if bytes[k].is_a?(Integer)
            end
            conts_expected = 0
          end
          if is_lead
            if i == bytes.length - 1
              # Final byte is leading? Clean it.
              bytes[i] = tidy_byte(bytes.last)
            else
              # Valid leading byte? Expect continuations determined by position of
              # first zero bit, with max of 3.
              conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
              last_lead = i
            end
          end
        end
      end

      begin
        bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
      rescue ArgumentError
        # unpack("U*") rejects byte sequences that are still not valid UTF-8.
        nil
      end
    end

    # Tidy bytes in-place.
    #
    # Returns the receiver after replacing its contents, or +nil+ (leaving the
    # receiver untouched) when #tidy_bytes has nothing to offer, i.e. for an
    # empty or undecodable string. Passing that +nil+ to +replace+ would raise
    # TypeError.
    def tidy_bytes!(force = false)
      tidied = tidy_bytes(force)
      tidied.nil? ? nil : replace(tidied)
    end

    private

    # Maps a single byte to its UTF-8 replacement.
    #
    # ASCII bytes (< 128) are returned unchanged, since the CP1252 table has
    # no entries for them. Bytes 128..159 go through the CP1252 table (and may
    # yield +nil+); bytes 160..255 become the two-byte UTF-8 sequence
    # [194, byte] or [195, byte - 64].
    def tidy_byte(byte)
      if byte < 128
        byte
      elsif byte < 160
        TibyBytes::CP1252[byte]
      elsif byte < 192
        [194, byte]
      else
        [195, byte - 64]
      end
    end

  end
end

class String
  include TibyBytes::StringMixin
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_machine_downloader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.11
|
4
|
+
version: 0.1.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hartator
|
@@ -49,6 +49,7 @@ extra_rdoc_files: []
|
|
49
49
|
files:
|
50
50
|
- bin/wayback_machine_downloader
|
51
51
|
- lib/wayback_machine_downloader.rb
|
52
|
+
- lib/wayback_machine_downloader/tidy_bytes.rb
|
52
53
|
homepage: https://github.com/hartator/wayback-machine-downloader
|
53
54
|
licenses:
|
54
55
|
- MIT
|