tmx_importer 0.4.0 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 77a255ff1cd88f6d3edbc6748e2680ec86486abf
4
- data.tar.gz: 01001fd8f8c612fffcc83e110b77d9738299e046
3
+ metadata.gz: 07447d45355fb271c832db8228367a0d14d04c5f
4
+ data.tar.gz: ab468fdf9295379cd036b351648f37eb3477468f
5
5
  SHA512:
6
- metadata.gz: da87359b112880242cca91b25005b828017d7563b2915217759576da7a8b222e29cfce2b7df55fd3196075c25ab1168cb3f5de4c4cb73dc25958b1075a9c7a3a
7
- data.tar.gz: ae1aade7c0e33859dd4f6dcceebb5171d345ef5f7dececcd3e5e57f4657f86162ceb752ce0ebc0511b4d38ebccee0e49810c1b745727ac75b1463ea9aa947c89
6
+ metadata.gz: 508a95ae93f7a43af68362b97ab429d5f746486ad2bc2216174d35490b1c9952e8059e5eb111074c05ba40055d1e255d34d23439caae5ebfcebfa6dcceb0116e
7
+ data.tar.gz: 5aa98d1fb3a4ed65554f149c9d8be21fc2a3fe2b9cc02034bb63718ec1a7ec68049da278c9793d036cf450aaf7099ce1799eccc01e554e724d69e52614d11daa
@@ -1,3 +1,3 @@
1
1
  module TmxImporter
2
- VERSION = "0.4.0"
2
+ VERSION = "0.5.0"
3
3
  end
data/lib/tmx_importer.rb CHANGED
@@ -2,8 +2,7 @@ require 'tmx_importer/version'
2
2
  require 'xml'
3
3
  require 'open-uri'
4
4
  require 'pretty_strings'
5
-
6
- Encoding.default_external = Encoding::UTF_8
5
+ require 'charlock_holmes'
7
6
 
8
7
  module TmxImporter
9
8
  class Tmx
@@ -17,20 +16,15 @@ module TmxImporter
17
16
  seg: { lang: "", counter: 0, vals: [], role: "" },
18
17
  language_pairs: []
19
18
  }
20
- @src_regex = Regexp.new('(?<=srclang=\S)\S+(?=")|(?=\')'.encode(@encoding))
21
- @src_string = 'srclang='.encode(@encoding).freeze
22
- @tu_regex = Regexp.new('<\/tu>'.encode(@encoding))
23
- @seg_regex = Regexp.new('<\/seg>'.encode(@encoding))
24
- @lang_regex = Regexp.new('(?<=[^cn]lang=\S)\S+(?=")|(?=\')'.encode(@encoding))
25
- @lang_string = 'lang'.encode(@encoding).freeze
19
+ @text = CharlockHolmes::Converter.convert(File.read(open(@file_path)), encoding, 'UTF-8') if !@encoding.eql?('UTF-8')
26
20
  raise "Encoding type not supported. Please choose an encoding of UTF-8, UTF-16LE, or UTF-16BE" unless @encoding.eql?('UTF-8') || @encoding.eql?('UTF-16LE') || @encoding.eql?('UTF-16BE')
27
21
  end
28
22
 
29
23
  def stats
30
- File.open(@file_path, "rb:#{encoding}") do |file|
31
- file.each do |line|
32
- analyze_line(line)
33
- end
24
+ if encoding.eql?('UTF-8')
25
+ analyze_stats_utf_8
26
+ else
27
+ analyze_stats_utf_16
34
28
  end
35
29
  {tu_count: @doc[:tu][:counter], seg_count: @doc[:seg][:counter], language_pairs: @doc[:language_pairs].uniq}
36
30
  end
@@ -43,29 +37,35 @@ module TmxImporter
43
37
 
44
38
  private
45
39
 
46
- def read_file
47
- XML::Reader.io(open(file_path), options: XML::Parser::Options::NOERROR, encoding: set_encoding)
40
+ def analyze_stats_utf_8
41
+ File.readlines(@file_path).each do |line|
42
+ analyze_line(line)
43
+ end
48
44
  end
49
45
 
50
- def analyze_line(line)
51
- @doc[:source_language] = line.scan(@src_regex)[0].encode('UTF-8') if line.include?(@src_string)
52
- @doc[:tu][:counter] += line.scan(@tu_regex).count
53
- @doc[:seg][:counter] += line.scan(@seg_regex).count
54
- if line.include?(@lang_string)
55
- @doc[:seg][:lang] = line.scan(@lang_regex)[0]
56
- @doc[:seg][:lang] = @doc[:seg][:lang].encode('UTF-8') unless @doc[:seg][:lang].nil?
57
- write_language_pair
46
+ def analyze_stats_utf_16
47
+ @text.each_line do |line|
48
+ analyze_line(line)
58
49
  end
59
50
  end
60
51
 
61
- def set_encoding
62
- case encoding
63
- when 'UTF-8'
64
- xml_encoding = XML::Encoding::UTF_8
65
- when 'UTF-16LE'
66
- xml_encoding = XML::Encoding::UTF_16LE
67
- when 'UTF-16BE'
68
- xml_encoding = XML::Encoding::UTF_16BE
52
+ def read_file
53
+ if encoding.eql?('UTF-8')
54
+ XML::Reader.io(open(file_path), options: XML::Parser::Options::NOERROR, encoding: XML::Encoding::UTF_8)
55
+ else
56
+ reader = @text.gsub!(/(?<=encoding=").*(?=")/, 'utf-8').gsub(/&#x[0-1]?[0-9a-fA-F];/, ' ').gsub(/[\0-\x1f\x7f\u2028]/, ' ')
57
+ XML::Reader.string(reader, options: XML::Parser::Options::NOERROR, encoding: XML::Encoding::UTF_8)
58
+ end
59
+ end
60
+
61
+ def analyze_line(line)
62
+ @doc[:source_language] = line.scan(/(?<=srclang=\S)\S+(?=")|(?=')/)[0] if line.include?('srclang=')
63
+ @doc[:tu][:counter] += line.scan(/<\/tu>/).count
64
+ @doc[:seg][:counter] += line.scan(/<\/seg>/).count
65
+ if line.include?('lang')
66
+ @doc[:seg][:lang] = line.scan(/(?<=[^cn]lang=\S)\S+(?=")|(?=')/)[0]
67
+ @doc[:seg][:lang] = @doc[:seg][:lang] unless @doc[:seg][:lang].nil?
68
+ write_language_pair
69
69
  end
70
70
  end
71
71
 
@@ -92,13 +92,13 @@ module TmxImporter
92
92
  def eval_state_initial(tag_stack, reader)
93
93
  case tag_stack.last.bytes.to_a
94
94
  when [104, 101, 97, 100, 101, 114]
95
- @doc[:source_language] = reader.get_attribute("srclang").force_encoding("UTF-8") if @doc[:source_language].empty? && reader.has_attributes? && reader.get_attribute("srclang")
95
+ @doc[:source_language] = reader.get_attribute("srclang") if @doc[:source_language].empty? && reader.has_attributes? && reader.get_attribute("srclang")
96
96
  when [116, 117]
97
97
  write_tu(reader)
98
98
  @doc[:tu][:counter] += 1
99
99
  when [116, 117, 118]
100
100
  seg_lang = reader.get_attribute("lang") || reader.get_attribute("xml:lang")
101
- @doc[:seg][:lang] = seg_lang.force_encoding("UTF-8") unless seg_lang.empty?
101
+ @doc[:seg][:lang] = seg_lang unless seg_lang.empty?
102
102
  when [115, 101, 103]
103
103
  write_seg(reader)
104
104
  write_language_pair
@@ -111,10 +111,10 @@ module TmxImporter
111
111
  if @doc[:seg][:lang] != @doc[:source_language] &&
112
112
  @doc[:seg][:lang].split('-')[0].downcase != @doc[:source_language].split('-')[0].downcase &&
113
113
  @doc[:source_language] != '*all*'
114
- @doc[:language_pairs] << [@doc[:source_language].force_encoding("UTF-8"), @doc[:seg][:lang].force_encoding("UTF-8")]
114
+ @doc[:language_pairs] << [@doc[:source_language], @doc[:seg][:lang]]
115
115
  @doc[:seg][:role] = 'source'
116
116
  elsif @doc[:source_language] == '*all*'
117
- @doc[:source_language] = @doc[:seg][:lang].force_encoding("UTF-8")
117
+ @doc[:source_language] = @doc[:seg][:lang]
118
118
  @doc[:seg][:role] = 'source'
119
119
  else
120
120
  @doc[:seg][:role] = 'target'
@@ -123,13 +123,13 @@ module TmxImporter
123
123
 
124
124
  def write_tu(reader)
125
125
  @doc[:tu][:lang] = reader.get_attribute("srclang")
126
- @doc[:tu][:creation_date] = reader.get_attribute("creationdate").nil? ? DateTime.now.to_s : DateTime.parse(reader.get_attribute("creationdate").force_encoding('UTF-8')).to_s
126
+ @doc[:tu][:creation_date] = reader.get_attribute("creationdate").nil? ? DateTime.now.to_s : DateTime.parse(reader.get_attribute("creationdate")).to_s
127
127
  @doc[:tu][:vals] << [@doc[:tu][:id], @doc[:tu][:creation_date]]
128
128
  end
129
129
 
130
130
  def write_seg(reader)
131
131
  return if reader.read_string.nil?
132
- text = PrettyStrings::Cleaner.new(reader.read_string.force_encoding('UTF-8')).pretty.gsub("\\","&#92;").gsub("'",%q(\\\'))
132
+ text = PrettyStrings::Cleaner.new(reader.read_string).pretty.gsub("\\","&#92;").gsub("'",%q(\\\'))
133
133
  word_count = text.gsub("\s+", ' ').split(' ').length
134
134
  @doc[:seg][:vals] << [@doc[:tu][:id], @doc[:seg][:role], word_count, @doc[:seg][:lang], text, @doc[:tu][:creation_date]]
135
135
  end
data/tmx_importer.gemspec CHANGED
@@ -23,4 +23,5 @@ Gem::Specification.new do |spec|
23
23
  spec.add_development_dependency "rspec"
24
24
  spec.add_runtime_dependency "libxml-ruby"
25
25
  spec.add_runtime_dependency "pretty_strings", "~> 0.5.0"
26
+ spec.add_runtime_dependency "charlock_holmes_bundle_icu", '~> 0.6.9.2'
26
27
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tmx_importer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-11 00:00:00.000000000 Z
11
+ date: 2016-03-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
82
  version: 0.5.0
83
+ - !ruby/object:Gem::Dependency
84
+ name: charlock_holmes_bundle_icu
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.6.9.2
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 0.6.9.2
83
97
  description: Import the content of a TMX translation memory file to your database
84
98
  email:
85
99
  - diasks2@gmail.com