tmx_importer 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 77a255ff1cd88f6d3edbc6748e2680ec86486abf
4
- data.tar.gz: 01001fd8f8c612fffcc83e110b77d9738299e046
3
+ metadata.gz: 07447d45355fb271c832db8228367a0d14d04c5f
4
+ data.tar.gz: ab468fdf9295379cd036b351648f37eb3477468f
5
5
  SHA512:
6
- metadata.gz: da87359b112880242cca91b25005b828017d7563b2915217759576da7a8b222e29cfce2b7df55fd3196075c25ab1168cb3f5de4c4cb73dc25958b1075a9c7a3a
7
- data.tar.gz: ae1aade7c0e33859dd4f6dcceebb5171d345ef5f7dececcd3e5e57f4657f86162ceb752ce0ebc0511b4d38ebccee0e49810c1b745727ac75b1463ea9aa947c89
6
+ metadata.gz: 508a95ae93f7a43af68362b97ab429d5f746486ad2bc2216174d35490b1c9952e8059e5eb111074c05ba40055d1e255d34d23439caae5ebfcebfa6dcceb0116e
7
+ data.tar.gz: 5aa98d1fb3a4ed65554f149c9d8be21fc2a3fe2b9cc02034bb63718ec1a7ec68049da278c9793d036cf450aaf7099ce1799eccc01e554e724d69e52614d11daa
@@ -1,3 +1,3 @@
1
1
  module TmxImporter
2
- VERSION = "0.4.0"
2
+ VERSION = "0.5.0"
3
3
  end
data/lib/tmx_importer.rb CHANGED
@@ -2,8 +2,7 @@ require 'tmx_importer/version'
2
2
  require 'xml'
3
3
  require 'open-uri'
4
4
  require 'pretty_strings'
5
-
6
- Encoding.default_external = Encoding::UTF_8
5
+ require 'charlock_holmes'
7
6
 
8
7
  module TmxImporter
9
8
  class Tmx
@@ -17,20 +16,15 @@ module TmxImporter
17
16
  seg: { lang: "", counter: 0, vals: [], role: "" },
18
17
  language_pairs: []
19
18
  }
20
- @src_regex = Regexp.new('(?<=srclang=\S)\S+(?=")|(?=\')'.encode(@encoding))
21
- @src_string = 'srclang='.encode(@encoding).freeze
22
- @tu_regex = Regexp.new('<\/tu>'.encode(@encoding))
23
- @seg_regex = Regexp.new('<\/seg>'.encode(@encoding))
24
- @lang_regex = Regexp.new('(?<=[^cn]lang=\S)\S+(?=")|(?=\')'.encode(@encoding))
25
- @lang_string = 'lang'.encode(@encoding).freeze
19
+ @text = CharlockHolmes::Converter.convert(File.read(open(@file_path)), encoding, 'UTF-8') if !@encoding.eql?('UTF-8')
26
20
  raise "Encoding type not supported. Please choose an encoding of UTF-8, UTF-16LE, or UTF-16BE" unless @encoding.eql?('UTF-8') || @encoding.eql?('UTF-16LE') || @encoding.eql?('UTF-16BE')
27
21
  end
28
22
 
29
23
  def stats
30
- File.open(@file_path, "rb:#{encoding}") do |file|
31
- file.each do |line|
32
- analyze_line(line)
33
- end
24
+ if encoding.eql?('UTF-8')
25
+ analyze_stats_utf_8
26
+ else
27
+ analyze_stats_utf_16
34
28
  end
35
29
  {tu_count: @doc[:tu][:counter], seg_count: @doc[:seg][:counter], language_pairs: @doc[:language_pairs].uniq}
36
30
  end
@@ -43,29 +37,35 @@ module TmxImporter
43
37
 
44
38
  private
45
39
 
46
- def read_file
47
- XML::Reader.io(open(file_path), options: XML::Parser::Options::NOERROR, encoding: set_encoding)
40
+ def analyze_stats_utf_8
41
+ File.readlines(@file_path).each do |line|
42
+ analyze_line(line)
43
+ end
48
44
  end
49
45
 
50
- def analyze_line(line)
51
- @doc[:source_language] = line.scan(@src_regex)[0].encode('UTF-8') if line.include?(@src_string)
52
- @doc[:tu][:counter] += line.scan(@tu_regex).count
53
- @doc[:seg][:counter] += line.scan(@seg_regex).count
54
- if line.include?(@lang_string)
55
- @doc[:seg][:lang] = line.scan(@lang_regex)[0]
56
- @doc[:seg][:lang] = @doc[:seg][:lang].encode('UTF-8') unless @doc[:seg][:lang].nil?
57
- write_language_pair
46
+ def analyze_stats_utf_16
47
+ @text.each_line do |line|
48
+ analyze_line(line)
58
49
  end
59
50
  end
60
51
 
61
- def set_encoding
62
- case encoding
63
- when 'UTF-8'
64
- xml_encoding = XML::Encoding::UTF_8
65
- when 'UTF-16LE'
66
- xml_encoding = XML::Encoding::UTF_16LE
67
- when 'UTF-16BE'
68
- xml_encoding = XML::Encoding::UTF_16BE
52
+ def read_file
53
+ if encoding.eql?('UTF-8')
54
+ XML::Reader.io(open(file_path), options: XML::Parser::Options::NOERROR, encoding: XML::Encoding::UTF_8)
55
+ else
56
+ reader = @text.gsub!(/(?<=encoding=").*(?=")/, 'utf-8').gsub(/&#x[0-1]?[0-9a-fA-F];/, ' ').gsub(/[\0-\x1f\x7f\u2028]/, ' ')
57
+ XML::Reader.string(reader, options: XML::Parser::Options::NOERROR, encoding: XML::Encoding::UTF_8)
58
+ end
59
+ end
60
+
61
+ def analyze_line(line)
62
+ @doc[:source_language] = line.scan(/(?<=srclang=\S)\S+(?=")|(?=')/)[0] if line.include?('srclang=')
63
+ @doc[:tu][:counter] += line.scan(/<\/tu>/).count
64
+ @doc[:seg][:counter] += line.scan(/<\/seg>/).count
65
+ if line.include?('lang')
66
+ @doc[:seg][:lang] = line.scan(/(?<=[^cn]lang=\S)\S+(?=")|(?=')/)[0]
67
+ @doc[:seg][:lang] = @doc[:seg][:lang] unless @doc[:seg][:lang].nil?
68
+ write_language_pair
69
69
  end
70
70
  end
71
71
 
@@ -92,13 +92,13 @@ module TmxImporter
92
92
  def eval_state_initial(tag_stack, reader)
93
93
  case tag_stack.last.bytes.to_a
94
94
  when [104, 101, 97, 100, 101, 114]
95
- @doc[:source_language] = reader.get_attribute("srclang").force_encoding("UTF-8") if @doc[:source_language].empty? && reader.has_attributes? && reader.get_attribute("srclang")
95
+ @doc[:source_language] = reader.get_attribute("srclang") if @doc[:source_language].empty? && reader.has_attributes? && reader.get_attribute("srclang")
96
96
  when [116, 117]
97
97
  write_tu(reader)
98
98
  @doc[:tu][:counter] += 1
99
99
  when [116, 117, 118]
100
100
  seg_lang = reader.get_attribute("lang") || reader.get_attribute("xml:lang")
101
- @doc[:seg][:lang] = seg_lang.force_encoding("UTF-8") unless seg_lang.empty?
101
+ @doc[:seg][:lang] = seg_lang unless seg_lang.empty?
102
102
  when [115, 101, 103]
103
103
  write_seg(reader)
104
104
  write_language_pair
@@ -111,10 +111,10 @@ module TmxImporter
111
111
  if @doc[:seg][:lang] != @doc[:source_language] &&
112
112
  @doc[:seg][:lang].split('-')[0].downcase != @doc[:source_language].split('-')[0].downcase &&
113
113
  @doc[:source_language] != '*all*'
114
- @doc[:language_pairs] << [@doc[:source_language].force_encoding("UTF-8"), @doc[:seg][:lang].force_encoding("UTF-8")]
114
+ @doc[:language_pairs] << [@doc[:source_language], @doc[:seg][:lang]]
115
115
  @doc[:seg][:role] = 'source'
116
116
  elsif @doc[:source_language] == '*all*'
117
- @doc[:source_language] = @doc[:seg][:lang].force_encoding("UTF-8")
117
+ @doc[:source_language] = @doc[:seg][:lang]
118
118
  @doc[:seg][:role] = 'source'
119
119
  else
120
120
  @doc[:seg][:role] = 'target'
@@ -123,13 +123,13 @@ module TmxImporter
123
123
 
124
124
  def write_tu(reader)
125
125
  @doc[:tu][:lang] = reader.get_attribute("srclang")
126
- @doc[:tu][:creation_date] = reader.get_attribute("creationdate").nil? ? DateTime.now.to_s : DateTime.parse(reader.get_attribute("creationdate").force_encoding('UTF-8')).to_s
126
+ @doc[:tu][:creation_date] = reader.get_attribute("creationdate").nil? ? DateTime.now.to_s : DateTime.parse(reader.get_attribute("creationdate")).to_s
127
127
  @doc[:tu][:vals] << [@doc[:tu][:id], @doc[:tu][:creation_date]]
128
128
  end
129
129
 
130
130
  def write_seg(reader)
131
131
  return if reader.read_string.nil?
132
- text = PrettyStrings::Cleaner.new(reader.read_string.force_encoding('UTF-8')).pretty.gsub("\\","&#92;").gsub("'",%q(\\\'))
132
+ text = PrettyStrings::Cleaner.new(reader.read_string).pretty.gsub("\\","&#92;").gsub("'",%q(\\\'))
133
133
  word_count = text.gsub("\s+", ' ').split(' ').length
134
134
  @doc[:seg][:vals] << [@doc[:tu][:id], @doc[:seg][:role], word_count, @doc[:seg][:lang], text, @doc[:tu][:creation_date]]
135
135
  end
data/tmx_importer.gemspec CHANGED
@@ -23,4 +23,5 @@ Gem::Specification.new do |spec|
23
23
  spec.add_development_dependency "rspec"
24
24
  spec.add_runtime_dependency "libxml-ruby"
25
25
  spec.add_runtime_dependency "pretty_strings", "~> 0.5.0"
26
+ spec.add_runtime_dependency "charlock_holmes_bundle_icu", '~> 0.6.9.2'
26
27
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tmx_importer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-11 00:00:00.000000000 Z
11
+ date: 2016-03-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
82
  version: 0.5.0
83
+ - !ruby/object:Gem::Dependency
84
+ name: charlock_holmes_bundle_icu
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.6.9.2
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 0.6.9.2
83
97
  description: Import the content of a TMX translation memory file to your database
84
98
  email:
85
99
  - diasks2@gmail.com