wayback_machine_downloader 0.1.11 → 0.1.12

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6f92d7e391faeb42397b9977342485cb66f63fda
4
- data.tar.gz: a9c72cc1304cd28226f6fcd111cba6398ab3f5b0
3
+ metadata.gz: d6580398142a2bde95000f07ef0063927b969676
4
+ data.tar.gz: 029c7e07d34b4c2b3b7d0c85c0567d495c6bd4ed
5
5
  SHA512:
6
- metadata.gz: 700e7774978a1b2f02eac2485126fe9e8f900a741a45c1d30ab6a89c890029c7d04ec59938147181392a24b663c43cbdd878865a553dfbd5a430abaaa2d8899a
7
- data.tar.gz: 17499b71152c831974e0c3d5afb737d3a251e1d5d446775cccfc0179a618a98f49b7aadf570b22fe628f6b83c6b0d87100d71df2f778d47c4b5c1a58145f787d
6
+ metadata.gz: c1687ba78e8a13d0553dfbba528f36b5cfdfc089289450720ef10890f072b7d7bc31170810fde87825a4a9f0f9bcee0f6f3e5dbfd835f737c5b6edaecdfcf31d
7
+ data.tar.gz: 8683d2a3cfdaae90039fdb117f0ed8776fcaad8ea919fed0ac2bbaae98ed6c42403f3488d73afa0e73b3285aa4b5628ba99a7d60a98979f5a2379207013f5ce9
@@ -4,7 +4,7 @@ require_relative 'wayback_machine_downloader/tidy_bytes'
4
4
 
5
5
  class WaybackMachineDownloader
6
6
 
7
- VERSION = "0.1.11"
7
+ VERSION = "0.1.12"
8
8
 
9
9
  attr_accessor :base_url, :timestamp
10
10
 
@@ -0,0 +1,122 @@
1
# Best-effort repair of strings containing invalid UTF-8, on the assumption
# that the offending bytes are really Windows CP-1252 / ISO-8859-1.
module TibyBytes

  # CP-1252 decimal byte => UTF-8 approximation as an array of bytes.
  # A +nil+ entry means the table has no replacement for that byte; such
  # bytes are dropped from the output.
  CP1252 = {
    128 => [226, 130, 172],
    129 => nil,
    130 => [226, 128, 154],
    131 => [198, 146],
    132 => [226, 128, 158],
    133 => [226, 128, 166],
    134 => [226, 128, 160],
    135 => [226, 128, 161],
    136 => [203, 134],
    137 => [226, 128, 176],
    138 => [197, 160],
    139 => [226, 128, 185],
    140 => [197, 146],
    141 => nil,
    142 => [197, 189],
    143 => nil,
    144 => nil,
    145 => [226, 128, 152],
    146 => [226, 128, 153],
    147 => [226, 128, 156],
    148 => [226, 128, 157],
    149 => [226, 128, 162],
    150 => [226, 128, 147],
    151 => [226, 128, 148],
    152 => [203, 156],
    153 => [226, 132, 162],
    154 => [197, 161],
    155 => [226, 128, 186],
    156 => [197, 147],
    157 => nil,
    158 => [197, 190],
    159 => [197, 184]
  }.freeze

  module StringMixin

    # Attempt to replace invalid UTF-8 bytes with valid ones. This method
    # naively assumes if you have invalid UTF8 bytes, they are either Windows
    # CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
    # always work.
    #
    # Passing +true+ will forcibly tidy all bytes, assuming that the string's
    # encoding is CP-1252 or ISO-8859-1. ASCII bytes are kept as they are.
    #
    # Returns a new string, or +nil+ when the receiver is empty (non-forced
    # mode) or when the cleaned bytes still cannot be unpacked as UTF-8.
    def tidy_bytes(force = false)
      if force
        forced = unpack("C*").map { |b| tidy_byte(b) }
        return forced.flatten.compact.pack("C*").unpack("U*").pack("U*")
      end

      bytes = unpack("C*")
      conts_expected = 0
      last_lead = 0

      bytes.each_index do |i|
        byte          = bytes[i]
        is_cont       = byte > 127 && byte < 192
        is_lead       = byte > 191 && byte < 245
        # Note: this threshold also matches 241..244, so those bytes are
        # cleaned rather than treated as leading bytes.
        is_unused     = byte > 240
        is_restricted = byte > 244

        # Impossible or highly unlikely byte? Clean it.
        if is_unused || is_restricted
          bytes[i] = tidy_byte(byte)
        elsif is_cont
          # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
          conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
        else
          if conts_expected > 0
            # Expected continuation, but got ASCII or leading? Clean backwards up to
            # the leading byte.
            begin
              (1..(i - last_lead)).each { |j| bytes[i - j] = tidy_byte(bytes[i - j]) }
            rescue NoMethodError
              # A slot in that range was already replaced (by an Array or nil)
              # and cannot be compared again; skip the rest of this iteration.
              next
            end
            conts_expected = 0
          end
          if is_lead
            # Final byte is leading? Clean it.
            if i == bytes.length - 1
              bytes[i] = tidy_byte(bytes.last)
            else
              # Valid leading byte? Expect continuations determined by position of
              # first zero bit, with max of 3.
              conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
              last_lead = i
            end
          end
        end
      end

      begin
        bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
      rescue ArgumentError
        # The cleaned byte sequence is still not decodable as UTF-8.
        nil
      end
    end

    # Tidy bytes in-place.
    #
    # Returns the receiver after replacing its contents, or +nil+ (leaving the
    # receiver untouched) when #tidy_bytes yields +nil+. Previously that case
    # raised TypeError from String#replace.
    def tidy_bytes!(force = false)
      tidied = tidy_bytes(force)
      return nil if tidied.nil?

      replace(tidied)
    end

    private

    # Map one byte to its replacement: the byte itself for ASCII, the CP1252
    # table entry (possibly +nil+) for 128..159, and a two-byte sequence
    # otherwise.
    def tidy_byte(byte)
      # ASCII is already valid UTF-8. Looking it up in CP1252 would yield nil
      # and silently delete the character.
      return byte if byte < 128

      if byte < 160
        TibyBytes::CP1252[byte]
      elsif byte < 192
        [194, byte]
      else
        [195, byte - 64]
      end
    end

  end
end
119
+
120
# Mix the byte-tidying helpers into every String instance.
String.send(:include, TibyBytes::StringMixin)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.11
4
+ version: 0.1.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator
@@ -49,6 +49,7 @@ extra_rdoc_files: []
49
49
  files:
50
50
  - bin/wayback_machine_downloader
51
51
  - lib/wayback_machine_downloader.rb
52
+ - lib/wayback_machine_downloader/tidy_bytes.rb
52
53
  homepage: https://github.com/hartator/wayback-machine-downloader
53
54
  licenses:
54
55
  - MIT