biblicit 2.0.6 → 2.0.7
Sign up to get free protection for your applications and to get access to all the features.
data/lib/biblicit/version.rb
CHANGED
@@ -26,12 +26,33 @@ def getHeader(str)
|
|
26
26
|
return str.downcase
|
27
27
|
end
|
28
28
|
|
29
|
+
# Converts a string to UTF-8 in place and returns it.
#
# The conversion is attempted in three stages:
#
# 1. If the bytes are already valid UTF-8, the string is only re-tagged as
#    UTF-8 and returned untouched.
# 2. Otherwise the bytes are assumed to be ISO-8859-1 / Windows-1252 and are
#    transcoded to UTF-8.
# 3. If the bytes are not valid in that encoding either, every byte sequence
#    that is not valid UTF-8 is stripped and the remainder is kept.
#
# Destructive! The argument itself is modified (both its encoding tag and,
# in stages 2 and 3, its bytes), so it must not be frozen.
#
# @param string [String] the string to convert; mutated in place
# @return [String] the same string object, now valid UTF-8
def force_utf8!(string)
  string.force_encoding("UTF-8")
  return string if string.valid_encoding?

  begin
    # Windows-1252 is used as the common superset of ISO-8859-1.
    string.force_encoding("Windows-1252")
    string.encode!("UTF-8")
  rescue Encoding::InvalidByteSequenceError,
         Encoding::UndefinedConversionError
    # Not valid Windows-1252 either: treat the bytes as UTF-8 again and drop
    # everything that does not decode. The round trip through UTF-16 forces
    # an actual transcoding pass so that the :replace options are applied.
    string.force_encoding("UTF-8")
    string.encode!("UTF-16", invalid: :replace, undef: :replace, replace: "")
    string.encode!("UTF-8")
  end

  string
end
|
29
50
|
|
30
51
|
f = File.open("#{ARGV[0]}")
|
31
52
|
hea_array = Array.new
|
32
53
|
ahea_array = Array.new
|
33
54
|
while !f.eof do
|
34
|
-
l = f.gets.chomp.strip
|
55
|
+
l = force_utf8!(f.gets).chomp.strip
|
35
56
|
if l != ""
|
36
57
|
tmp_array = l.split("|||")
|
37
58
|
if tmp_array.length == 1
|