biblicit 2.0.6 → 2.0.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Biblicit
4
4
 
5
- VERSION = '2.0.6'
5
+ VERSION = '2.0.7'
6
6
 
7
7
  end
@@ -26,12 +26,33 @@ def getHeader(str)
26
26
  return str.downcase
27
27
  end
28
28
 
29
+ # Converts a string to UTF-8. If the string is already valid UTF-8, it just
30
+ # marks it as such. If the string isn't valid UTF-8, we assume that it's
31
+ # ISO-8859-1 or Windows-1252 and convert it. If it's not valid in that encoding
32
+ # either, we just drop every byte sequence that is not valid UTF-8.
33
+ #
34
+ # Destructive!
35
+ def force_utf8!(string)
36
+ string.force_encoding "UTF-8"
37
+ return string if string.valid_encoding?
38
+
39
+ begin
40
+ string.force_encoding "Windows-1252" # common superset of 8859-1
41
+ string.encode! "UTF-8"
42
+ rescue Encoding::InvalidByteSequenceError,
43
+ Encoding::UndefinedConversionError
44
+ string.force_encoding "UTF-8"
45
+ string.encode! "UTF-16",
46
+ invalid: :replace, undef: :replace, replace: ""
47
+ string.encode! "UTF-8"
48
+ end
49
+ end
29
50
 
30
51
  f = File.open("#{ARGV[0]}")
31
52
  hea_array = Array.new
32
53
  ahea_array = Array.new
33
54
  while !f.eof do
34
- l = f.gets.chomp.strip
55
+ l = force_utf8!(f.gets).chomp.strip
35
56
  if l != ""
36
57
  tmp_array = l.split("|||")
37
58
  if tmp_array.length == 1
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: biblicit
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.6
4
+ version: 2.0.7
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: