tmx_importer 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/tmx_importer/version.rb +1 -1
- data/lib/tmx_importer.rb +36 -36
- data/tmx_importer.gemspec +1 -0
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 07447d45355fb271c832db8228367a0d14d04c5f
|
4
|
+
data.tar.gz: ab468fdf9295379cd036b351648f37eb3477468f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 508a95ae93f7a43af68362b97ab429d5f746486ad2bc2216174d35490b1c9952e8059e5eb111074c05ba40055d1e255d34d23439caae5ebfcebfa6dcceb0116e
|
7
|
+
data.tar.gz: 5aa98d1fb3a4ed65554f149c9d8be21fc2a3fe2b9cc02034bb63718ec1a7ec68049da278c9793d036cf450aaf7099ce1799eccc01e554e724d69e52614d11daa
|
data/lib/tmx_importer/version.rb
CHANGED
data/lib/tmx_importer.rb
CHANGED
@@ -2,8 +2,7 @@ require 'tmx_importer/version'
|
|
2
2
|
require 'xml'
|
3
3
|
require 'open-uri'
|
4
4
|
require 'pretty_strings'
|
5
|
-
|
6
|
-
Encoding.default_external = Encoding::UTF_8
|
5
|
+
require 'charlock_holmes'
|
7
6
|
|
8
7
|
module TmxImporter
|
9
8
|
class Tmx
|
@@ -17,20 +16,15 @@ module TmxImporter
|
|
17
16
|
seg: { lang: "", counter: 0, vals: [], role: "" },
|
18
17
|
language_pairs: []
|
19
18
|
}
|
20
|
-
@
|
21
|
-
@src_string = 'srclang='.encode(@encoding).freeze
|
22
|
-
@tu_regex = Regexp.new('<\/tu>'.encode(@encoding))
|
23
|
-
@seg_regex = Regexp.new('<\/seg>'.encode(@encoding))
|
24
|
-
@lang_regex = Regexp.new('(?<=[^cn]lang=\S)\S+(?=")|(?=\')'.encode(@encoding))
|
25
|
-
@lang_string = 'lang'.encode(@encoding).freeze
|
19
|
+
@text = CharlockHolmes::Converter.convert(File.read(open(@file_path)), encoding, 'UTF-8') if !@encoding.eql?('UTF-8')
|
26
20
|
raise "Encoding type not supported. Please choose an encoding of UTF-8, UTF-16LE, or UTF-16BE" unless @encoding.eql?('UTF-8') || @encoding.eql?('UTF-16LE') || @encoding.eql?('UTF-16BE')
|
27
21
|
end
|
28
22
|
|
29
23
|
def stats
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
24
|
+
if encoding.eql?('UTF-8')
|
25
|
+
analyze_stats_utf_8
|
26
|
+
else
|
27
|
+
analyze_stats_utf_16
|
34
28
|
end
|
35
29
|
{tu_count: @doc[:tu][:counter], seg_count: @doc[:seg][:counter], language_pairs: @doc[:language_pairs].uniq}
|
36
30
|
end
|
@@ -43,29 +37,35 @@ module TmxImporter
|
|
43
37
|
|
44
38
|
private
|
45
39
|
|
46
|
-
def
|
47
|
-
|
40
|
+
def analyze_stats_utf_8
|
41
|
+
File.readlines(@file_path).each do |line|
|
42
|
+
analyze_line(line)
|
43
|
+
end
|
48
44
|
end
|
49
45
|
|
50
|
-
def
|
51
|
-
@
|
52
|
-
|
53
|
-
@doc[:seg][:counter] += line.scan(@seg_regex).count
|
54
|
-
if line.include?(@lang_string)
|
55
|
-
@doc[:seg][:lang] = line.scan(@lang_regex)[0]
|
56
|
-
@doc[:seg][:lang] = @doc[:seg][:lang].encode('UTF-8') unless @doc[:seg][:lang].nil?
|
57
|
-
write_language_pair
|
46
|
+
def analyze_stats_utf_16
|
47
|
+
@text.each_line do |line|
|
48
|
+
analyze_line(line)
|
58
49
|
end
|
59
50
|
end
|
60
51
|
|
61
|
-
def
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
52
|
+
def read_file
|
53
|
+
if encoding.eql?('UTF-8')
|
54
|
+
XML::Reader.io(open(file_path), options: XML::Parser::Options::NOERROR, encoding: XML::Encoding::UTF_8)
|
55
|
+
else
|
56
|
+
reader = @text.gsub!(/(?<=encoding=").*(?=")/, 'utf-8').gsub(/&#x[0-1]?[0-9a-fA-F];/, ' ').gsub(/[\0-\x1f\x7f\u2028]/, ' ')
|
57
|
+
XML::Reader.string(reader, options: XML::Parser::Options::NOERROR, encoding: XML::Encoding::UTF_8)
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
def analyze_line(line)
|
62
|
+
@doc[:source_language] = line.scan(/(?<=srclang=\S)\S+(?=")|(?=')/)[0] if line.include?('srclang=')
|
63
|
+
@doc[:tu][:counter] += line.scan(/<\/tu>/).count
|
64
|
+
@doc[:seg][:counter] += line.scan(/<\/seg>/).count
|
65
|
+
if line.include?('lang')
|
66
|
+
@doc[:seg][:lang] = line.scan(/(?<=[^cn]lang=\S)\S+(?=")|(?=')/)[0]
|
67
|
+
@doc[:seg][:lang] = @doc[:seg][:lang] unless @doc[:seg][:lang].nil?
|
68
|
+
write_language_pair
|
69
69
|
end
|
70
70
|
end
|
71
71
|
|
@@ -92,13 +92,13 @@ module TmxImporter
|
|
92
92
|
def eval_state_initial(tag_stack, reader)
|
93
93
|
case tag_stack.last.bytes.to_a
|
94
94
|
when [104, 101, 97, 100, 101, 114]
|
95
|
-
@doc[:source_language] = reader.get_attribute("srclang")
|
95
|
+
@doc[:source_language] = reader.get_attribute("srclang") if @doc[:source_language].empty? && reader.has_attributes? && reader.get_attribute("srclang")
|
96
96
|
when [116, 117]
|
97
97
|
write_tu(reader)
|
98
98
|
@doc[:tu][:counter] += 1
|
99
99
|
when [116, 117, 118]
|
100
100
|
seg_lang = reader.get_attribute("lang") || reader.get_attribute("xml:lang")
|
101
|
-
@doc[:seg][:lang] = seg_lang
|
101
|
+
@doc[:seg][:lang] = seg_lang unless seg_lang.empty?
|
102
102
|
when [115, 101, 103]
|
103
103
|
write_seg(reader)
|
104
104
|
write_language_pair
|
@@ -111,10 +111,10 @@ module TmxImporter
|
|
111
111
|
if @doc[:seg][:lang] != @doc[:source_language] &&
|
112
112
|
@doc[:seg][:lang].split('-')[0].downcase != @doc[:source_language].split('-')[0].downcase &&
|
113
113
|
@doc[:source_language] != '*all*'
|
114
|
-
@doc[:language_pairs] << [@doc[:source_language]
|
114
|
+
@doc[:language_pairs] << [@doc[:source_language], @doc[:seg][:lang]]
|
115
115
|
@doc[:seg][:role] = 'source'
|
116
116
|
elsif @doc[:source_language] == '*all*'
|
117
|
-
@doc[:source_language] = @doc[:seg][:lang]
|
117
|
+
@doc[:source_language] = @doc[:seg][:lang]
|
118
118
|
@doc[:seg][:role] = 'source'
|
119
119
|
else
|
120
120
|
@doc[:seg][:role] = 'target'
|
@@ -123,13 +123,13 @@ module TmxImporter
|
|
123
123
|
|
124
124
|
def write_tu(reader)
|
125
125
|
@doc[:tu][:lang] = reader.get_attribute("srclang")
|
126
|
-
@doc[:tu][:creation_date] = reader.get_attribute("creationdate").nil? ? DateTime.now.to_s : DateTime.parse(reader.get_attribute("creationdate")
|
126
|
+
@doc[:tu][:creation_date] = reader.get_attribute("creationdate").nil? ? DateTime.now.to_s : DateTime.parse(reader.get_attribute("creationdate")).to_s
|
127
127
|
@doc[:tu][:vals] << [@doc[:tu][:id], @doc[:tu][:creation_date]]
|
128
128
|
end
|
129
129
|
|
130
130
|
def write_seg(reader)
|
131
131
|
return if reader.read_string.nil?
|
132
|
-
text = PrettyStrings::Cleaner.new(reader.read_string
|
132
|
+
text = PrettyStrings::Cleaner.new(reader.read_string).pretty.gsub("\\","&#92;").gsub("'",%q(\\\'))
|
133
133
|
word_count = text.gsub("\s+", ' ').split(' ').length
|
134
134
|
@doc[:seg][:vals] << [@doc[:tu][:id], @doc[:seg][:role], word_count, @doc[:seg][:lang], text, @doc[:tu][:creation_date]]
|
135
135
|
end
|
data/tmx_importer.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tmx_importer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: 0.5.0
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: charlock_holmes_bundle_icu
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.6.9.2
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 0.6.9.2
|
83
97
|
description: Import the content of a TMX translation memory file to your database
|
84
98
|
email:
|
85
99
|
- diasks2@gmail.com
|