biblicit 2.0.7 → 2.0.8

Sign up to get free protection for your applications and to get access to all the features.
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Biblicit
4
4
 
5
- VERSION = '2.0.7'
5
+ VERSION = '2.0.8'
6
6
 
7
7
  end
@@ -2,6 +2,8 @@
2
2
  # encoding: UTF-8
3
3
  #Author Nguyen Thuy Dung
4
4
  require 'find'
5
+ require "#{File.dirname(__FILE__)}/forceUtf8"
6
+
5
7
  #get relative pos in integer, values range from 0-10
6
8
  def getPos (val)
7
9
  if val == 0
@@ -26,28 +28,6 @@ def getHeader(str)
26
28
  return str.downcase
27
29
  end
28
30
 
29
- # Converts a string to UTF-8. If the string is already valid UTF-8, it just
30
- # marks it as such. If the string isn't valid UTF-8, we assume that it's
31
- # ISO-8859-1 or Windows-1252 and convert it. If it's not valid in that encoding
32
- # either, we just strip all non-UTF-8 characters and call it a day.
33
- #
34
- # Destructive!
35
- def force_utf8!(string)
36
- string.force_encoding "UTF-8"
37
- return string if string.valid_encoding?
38
-
39
- begin
40
- string.force_encoding "Windows-1252" # common superset of 8859-1
41
- string.encode! "UTF-8"
42
- rescue Encoding::InvalidByteSequenceError,
43
- Encoding::UndefinedConversionError
44
- string.force_encoding "UTF-8"
45
- string.encode! "UTF-16",
46
- invalid: :replace, undef: :replace, replace: ""
47
- string.encode! "UTF-8"
48
- end
49
- end
50
-
51
31
  f = File.open("#{ARGV[0]}")
52
32
  hea_array = Array.new
53
33
  ahea_array = Array.new
@@ -0,0 +1,24 @@
1
+ # encoding: UTF-8
2
+
3
+ # Converts a string to UTF-8. If the string is already valid UTF-8, it just
4
+ # marks it as such. If the string isn't valid UTF-8, we assume that it's
5
+ # ISO-8859-1 or Windows-1252 and convert it. If it's not valid in that encoding
6
+ # either, we just strip all non-UTF-8 characters and call it a day.
7
+ #
8
+ # Destructive!
9
+ def force_utf8!(string)
10
+ string.force_encoding "UTF-8"
11
+ return string if string.valid_encoding?
12
+
13
+ begin
14
+ string.force_encoding "Windows-1252" # common superset of 8859-1
15
+ string.encode! "UTF-8"
16
+ rescue Encoding::InvalidByteSequenceError,
17
+ Encoding::UndefinedConversionError
18
+ string.force_encoding "UTF-8"
19
+ string.encode! "UTF-16",
20
+ invalid: :replace, undef: :replace, replace: ""
21
+ string.encode! "UTF-8"
22
+ end
23
+ end
24
+
@@ -10,6 +10,8 @@ pwd = File.dirname(__FILE__)
10
10
  @DATA = "#{pwd}/../../resources/sectLabel/"
11
11
  @TEST_DIR = "/tmp/"
12
12
 
13
+ require "#{@SRC}/forceUtf8"
14
+
13
15
  name = "#{Time.now.to_i}-#{Process.pid}"
14
16
 
15
17
  cmd = "ruby #{@SRC}/extractFeature.rb #{ARGV[0]} > #{@TEST_DIR}/#{name}.test"
@@ -28,7 +30,7 @@ end
28
30
 
29
31
  f = File.open("#{@TEST_DIR}/#{name}.out")
30
32
  while !f.eof do
31
- str = f.gets.chomp.strip
33
+ str = force_utf8!(f.gets).chomp.strip
32
34
  if str != ""
33
35
  l = str.split(" ")
34
36
  output = l.at(l.length-1)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: biblicit
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.7
4
+ version: 2.0.8
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -401,6 +401,7 @@ files:
401
401
  - parscit/bin/sectLabel/README.txt
402
402
  - parscit/bin/sectLabel/genericSect/crossValidation.rb
403
403
  - parscit/bin/sectLabel/genericSect/extractFeature.rb
404
+ - parscit/bin/sectLabel/genericSect/forceUtf8.rb
404
405
  - parscit/bin/sectLabel/genericSectExtract.rb
405
406
  - parscit/bin/sectLabel/getStructureInfo.pl
406
407
  - parscit/bin/sectLabel/redo.sectLabel.pl