tmx_importer 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 879c5a494ffdf3380ca31ffc402dc031261db2bb
4
- data.tar.gz: 09cd7465ac1b8cb93fc21edc411cba48ca20ae7d
3
+ metadata.gz: 9171f015c2f0e45c3772d64d0f98e53949abf89a
4
+ data.tar.gz: f227573b58faf7b215695d9a72ff12df051bd6e9
5
5
  SHA512:
6
- metadata.gz: a1e9252071f93e9a8a65f811a931e7c0aef4f914b7709d28780be54f69f543508773ac2fe379d2c67587abcaa2ce074b41bb9e1eceb628e6f61c432c5c9f0d80
7
- data.tar.gz: 7e8e76da0b91f2ba041e75e7d86d348d6ad1fe66f03506173e0535d6e0395f9e12a0bd04df9699e4141a931bc1da1aad74498af37c54e0fdd7bba1bc4a30c7c9
6
+ metadata.gz: b9dd9c281a0453d994244f87aeaf466c46aa98336a3bae7a598344df23bb01b66779d235f305533e421f04b1a6ca1a695745a28b7311abb18ed3563b724478a8
7
+ data.tar.gz: 8d61b7d51f8b6ed56c21b68639f52aaa3c3f9ed1e6ee158cf1ecfed7669c77cdafb65945e718d36228545d281e9b385ed416639e0413ecfee258ed2922ee43b1
@@ -1,3 +1,3 @@
1
1
  module TmxImporter
2
- VERSION = "0.2.1"
2
+ VERSION = "0.3.0"
3
3
  end
data/lib/tmx_importer.rb CHANGED
@@ -3,9 +3,6 @@ require 'xml'
3
3
  require 'open-uri'
4
4
  require 'pretty_strings'
5
5
 
6
- Encoding.default_internal = Encoding::UTF_8
7
- Encoding.default_external = Encoding::UTF_8
8
-
9
6
  module TmxImporter
10
7
  class Tmx
11
8
  attr_reader :file_path, :encoding
@@ -18,6 +15,12 @@ module TmxImporter
18
15
  seg: { lang: "", counter: 0, vals: [], role: "" },
19
16
  language_pairs: []
20
17
  }
18
+ @src_regex = Regexp.new('(?<=srclang=\S)\S+(?=")|(?=\')'.encode(@encoding))
19
+ @src_string = 'srclang='.encode(@encoding).freeze
20
+ @tu_regex = Regexp.new('<\/tu>'.encode(@encoding))
21
+ @seg_regex = Regexp.new('<\/seg>'.encode(@encoding))
22
+ @lang_regex = Regexp.new('(?<=[^cn]lang=\S)\S+(?=")|(?=\')'.encode(@encoding))
23
+ @lang_string = 'lang'.encode(@encoding).freeze
21
24
  raise "Encoding type not supported. Please choose an encoding of UTF-8, UTF-16LE, or UTF-16BE" unless @encoding.eql?('UTF-8') || @encoding.eql?('UTF-16LE') || @encoding.eql?('UTF-16BE')
22
25
  end
23
26
 
@@ -43,11 +46,12 @@ module TmxImporter
43
46
  end
44
47
 
45
48
  def analyze_line(line)
46
- @doc[:source_language] = line.scan(/(?<=srclang=\S)\S+(?=")|(?=')/)[0] if line.include?('srclang=')
47
- @doc[:tu][:counter] += line.scan(/<\/tu>/).count
48
- @doc[:seg][:counter] += line.scan(/<\/seg>/).count
49
- if line.include?('lang')
50
- @doc[:seg][:lang] = line.scan(/(?<=[^cn]lang=\S)\S+(?=")|(?=')/)[0]
49
+ @doc[:source_language] = line.scan(@src_regex)[0].encode('UTF-8') if line.include?(@src_string)
50
+ @doc[:tu][:counter] += line.scan(@tu_regex).count
51
+ @doc[:seg][:counter] += line.scan(@seg_regex).count
52
+ if line.include?(@lang_string)
53
+ @doc[:seg][:lang] = line.scan(@lang_regex)[0]
54
+ @doc[:seg][:lang] = @doc[:seg][:lang].encode('UTF-8') unless @doc[:seg][:lang].nil?
51
55
  write_language_pair
52
56
  end
53
57
  end
@@ -105,7 +109,7 @@ module TmxImporter
105
109
  if @doc[:seg][:lang] != @doc[:source_language] &&
106
110
  @doc[:seg][:lang].split('-')[0].downcase != @doc[:source_language].split('-')[0].downcase &&
107
111
  @doc[:source_language] != '*all*'
108
- @doc[:language_pairs] << [@doc[:source_language], @doc[:seg][:lang]]
112
+ @doc[:language_pairs] << [@doc[:source_language].force_encoding("UTF-8"), @doc[:seg][:lang].force_encoding("UTF-8")]
109
113
  @doc[:seg][:role] = 'source'
110
114
  elsif @doc[:source_language] == '*all*'
111
115
  @doc[:source_language] = @doc[:seg][:lang]
@@ -117,7 +121,7 @@ module TmxImporter
117
121
 
118
122
  def write_tu(reader)
119
123
  @doc[:tu][:lang] = reader.get_attribute("srclang")
120
- @doc[:tu][:creation_date] = reader.get_attribute("creationdate").nil? ? DateTime.now.to_s : DateTime.parse(reader.get_attribute("creationdate")).to_s
124
+ @doc[:tu][:creation_date] = reader.get_attribute("creationdate").nil? ? DateTime.now.to_s : DateTime.parse(reader.get_attribute("creationdate").force_encoding('UTF-8')).to_s
121
125
  @doc[:tu][:vals] << [@doc[:tu][:id], @doc[:tu][:creation_date]]
122
126
  end
123
127
 
@@ -67,6 +67,12 @@ describe TmxImporter do
67
67
  tmx = TmxImporter::Tmx.new(file_path: file_path, encoding: 'utf-8')
68
68
  expect(tmx.stats).to eq({:tu_count=>4, :seg_count=>8, :language_pairs=>[["de-DE", "en-US"]]})
69
69
  end
70
+
71
+ it 'imports a TMX file with UTF-16 LE BOM encoding' do
72
+ file_path = File.expand_path('../tmx_importer/spec/test_sample_files/strange_encoding.tmx')
73
+ tmx = TmxImporter::Tmx.new(file_path: file_path, encoding: 'utf-16LE')
74
+ expect(tmx.stats).to eq({:tu_count=>1, :seg_count=>2, :language_pairs=>[["tr", "en"]]})
75
+ end
70
76
  end
71
77
 
72
78
  describe '#import' do
@@ -117,5 +123,11 @@ describe TmxImporter do
117
123
  tmx = TmxImporter::Tmx.new(file_path: file_path, encoding: 'utf-8')
118
124
  expect(tmx.import[1][2][1]).to eq("target")
119
125
  end
126
+
127
+ it 'imports a TMX file with UTF-16 LE BOM encoding' do
128
+ file_path = File.expand_path('../tmx_importer/spec/test_sample_files/strange_encoding.tmx')
129
+ tmx = TmxImporter::Tmx.new(file_path: file_path, encoding: 'utf-16LE')
130
+ expect(tmx.import[1][1][3]).to eq("en")
131
+ end
120
132
  end
121
133
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tmx_importer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
@@ -103,6 +103,7 @@ files:
103
103
  - spec/test_sample_files/multiple_language_pairs.tmx
104
104
  - spec/test_sample_files/out_of_order_segments.tmx
105
105
  - spec/test_sample_files/srclang_all.tmx
106
+ - spec/test_sample_files/strange_encoding.tmx
106
107
  - spec/test_sample_files/test_tm(utf-16LE BOM).tmx
107
108
  - spec/test_sample_files/test_tm(utf-16LE).tmx
108
109
  - spec/test_sample_files/test_tm(utf-8).tmx
@@ -140,6 +141,7 @@ test_files:
140
141
  - spec/test_sample_files/multiple_language_pairs.tmx
141
142
  - spec/test_sample_files/out_of_order_segments.tmx
142
143
  - spec/test_sample_files/srclang_all.tmx
144
+ - spec/test_sample_files/strange_encoding.tmx
143
145
  - spec/test_sample_files/test_tm(utf-16LE BOM).tmx
144
146
  - spec/test_sample_files/test_tm(utf-16LE).tmx
145
147
  - spec/test_sample_files/test_tm(utf-8).tmx