tmx_importer 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/tmx_importer/version.rb +1 -1
- data/lib/tmx_importer.rb +36 -36
- data/tmx_importer.gemspec +1 -0
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 07447d45355fb271c832db8228367a0d14d04c5f
|
4
|
+
data.tar.gz: ab468fdf9295379cd036b351648f37eb3477468f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 508a95ae93f7a43af68362b97ab429d5f746486ad2bc2216174d35490b1c9952e8059e5eb111074c05ba40055d1e255d34d23439caae5ebfcebfa6dcceb0116e
|
7
|
+
data.tar.gz: 5aa98d1fb3a4ed65554f149c9d8be21fc2a3fe2b9cc02034bb63718ec1a7ec68049da278c9793d036cf450aaf7099ce1799eccc01e554e724d69e52614d11daa
|
data/lib/tmx_importer/version.rb
CHANGED
data/lib/tmx_importer.rb
CHANGED
@@ -2,8 +2,7 @@ require 'tmx_importer/version'
|
|
2
2
|
require 'xml'
|
3
3
|
require 'open-uri'
|
4
4
|
require 'pretty_strings'
|
5
|
-
|
6
|
-
Encoding.default_external = Encoding::UTF_8
|
5
|
+
require 'charlock_holmes'
|
7
6
|
|
8
7
|
module TmxImporter
|
9
8
|
class Tmx
|
@@ -17,20 +16,15 @@ module TmxImporter
|
|
17
16
|
seg: { lang: "", counter: 0, vals: [], role: "" },
|
18
17
|
language_pairs: []
|
19
18
|
}
|
20
|
-
@
|
21
|
-
@src_string = 'srclang='.encode(@encoding).freeze
|
22
|
-
@tu_regex = Regexp.new('<\/tu>'.encode(@encoding))
|
23
|
-
@seg_regex = Regexp.new('<\/seg>'.encode(@encoding))
|
24
|
-
@lang_regex = Regexp.new('(?<=[^cn]lang=\S)\S+(?=")|(?=\')'.encode(@encoding))
|
25
|
-
@lang_string = 'lang'.encode(@encoding).freeze
|
19
|
+
@text = CharlockHolmes::Converter.convert(File.read(open(@file_path)), encoding, 'UTF-8') if !@encoding.eql?('UTF-8')
|
26
20
|
raise "Encoding type not supported. Please choose an encoding of UTF-8, UTF-16LE, or UTF-16BE" unless @encoding.eql?('UTF-8') || @encoding.eql?('UTF-16LE') || @encoding.eql?('UTF-16BE')
|
27
21
|
end
|
28
22
|
|
29
23
|
def stats
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
24
|
+
if encoding.eql?('UTF-8')
|
25
|
+
analyze_stats_utf_8
|
26
|
+
else
|
27
|
+
analyze_stats_utf_16
|
34
28
|
end
|
35
29
|
{tu_count: @doc[:tu][:counter], seg_count: @doc[:seg][:counter], language_pairs: @doc[:language_pairs].uniq}
|
36
30
|
end
|
@@ -43,29 +37,35 @@ module TmxImporter
|
|
43
37
|
|
44
38
|
private
|
45
39
|
|
46
|
-
def
|
47
|
-
|
40
|
+
def analyze_stats_utf_8
|
41
|
+
File.readlines(@file_path).each do |line|
|
42
|
+
analyze_line(line)
|
43
|
+
end
|
48
44
|
end
|
49
45
|
|
50
|
-
def
|
51
|
-
@
|
52
|
-
|
53
|
-
@doc[:seg][:counter] += line.scan(@seg_regex).count
|
54
|
-
if line.include?(@lang_string)
|
55
|
-
@doc[:seg][:lang] = line.scan(@lang_regex)[0]
|
56
|
-
@doc[:seg][:lang] = @doc[:seg][:lang].encode('UTF-8') unless @doc[:seg][:lang].nil?
|
57
|
-
write_language_pair
|
46
|
+
def analyze_stats_utf_16
|
47
|
+
@text.each_line do |line|
|
48
|
+
analyze_line(line)
|
58
49
|
end
|
59
50
|
end
|
60
51
|
|
61
|
-
def
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
52
|
+
def read_file
|
53
|
+
if encoding.eql?('UTF-8')
|
54
|
+
XML::Reader.io(open(file_path), options: XML::Parser::Options::NOERROR, encoding: XML::Encoding::UTF_8)
|
55
|
+
else
|
56
|
+
reader = @text.gsub!(/(?<=encoding=").*(?=")/, 'utf-8').gsub(/&#x[0-1]?[0-9a-fA-F];/, ' ').gsub(/[\0-\x1f\x7f\u2028]/, ' ')
|
57
|
+
XML::Reader.string(reader, options: XML::Parser::Options::NOERROR, encoding: XML::Encoding::UTF_8)
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
def analyze_line(line)
|
62
|
+
@doc[:source_language] = line.scan(/(?<=srclang=\S)\S+(?=")|(?=')/)[0] if line.include?('srclang=')
|
63
|
+
@doc[:tu][:counter] += line.scan(/<\/tu>/).count
|
64
|
+
@doc[:seg][:counter] += line.scan(/<\/seg>/).count
|
65
|
+
if line.include?('lang')
|
66
|
+
@doc[:seg][:lang] = line.scan(/(?<=[^cn]lang=\S)\S+(?=")|(?=')/)[0]
|
67
|
+
@doc[:seg][:lang] = @doc[:seg][:lang] unless @doc[:seg][:lang].nil?
|
68
|
+
write_language_pair
|
69
69
|
end
|
70
70
|
end
|
71
71
|
|
@@ -92,13 +92,13 @@ module TmxImporter
|
|
92
92
|
def eval_state_initial(tag_stack, reader)
|
93
93
|
case tag_stack.last.bytes.to_a
|
94
94
|
when [104, 101, 97, 100, 101, 114]
|
95
|
-
@doc[:source_language] = reader.get_attribute("srclang")
|
95
|
+
@doc[:source_language] = reader.get_attribute("srclang") if @doc[:source_language].empty? && reader.has_attributes? && reader.get_attribute("srclang")
|
96
96
|
when [116, 117]
|
97
97
|
write_tu(reader)
|
98
98
|
@doc[:tu][:counter] += 1
|
99
99
|
when [116, 117, 118]
|
100
100
|
seg_lang = reader.get_attribute("lang") || reader.get_attribute("xml:lang")
|
101
|
-
@doc[:seg][:lang] = seg_lang
|
101
|
+
@doc[:seg][:lang] = seg_lang unless seg_lang.empty?
|
102
102
|
when [115, 101, 103]
|
103
103
|
write_seg(reader)
|
104
104
|
write_language_pair
|
@@ -111,10 +111,10 @@ module TmxImporter
|
|
111
111
|
if @doc[:seg][:lang] != @doc[:source_language] &&
|
112
112
|
@doc[:seg][:lang].split('-')[0].downcase != @doc[:source_language].split('-')[0].downcase &&
|
113
113
|
@doc[:source_language] != '*all*'
|
114
|
-
@doc[:language_pairs] << [@doc[:source_language]
|
114
|
+
@doc[:language_pairs] << [@doc[:source_language], @doc[:seg][:lang]]
|
115
115
|
@doc[:seg][:role] = 'source'
|
116
116
|
elsif @doc[:source_language] == '*all*'
|
117
|
-
@doc[:source_language] = @doc[:seg][:lang]
|
117
|
+
@doc[:source_language] = @doc[:seg][:lang]
|
118
118
|
@doc[:seg][:role] = 'source'
|
119
119
|
else
|
120
120
|
@doc[:seg][:role] = 'target'
|
@@ -123,13 +123,13 @@ module TmxImporter
|
|
123
123
|
|
124
124
|
def write_tu(reader)
|
125
125
|
@doc[:tu][:lang] = reader.get_attribute("srclang")
|
126
|
-
@doc[:tu][:creation_date] = reader.get_attribute("creationdate").nil? ? DateTime.now.to_s : DateTime.parse(reader.get_attribute("creationdate")
|
126
|
+
@doc[:tu][:creation_date] = reader.get_attribute("creationdate").nil? ? DateTime.now.to_s : DateTime.parse(reader.get_attribute("creationdate")).to_s
|
127
127
|
@doc[:tu][:vals] << [@doc[:tu][:id], @doc[:tu][:creation_date]]
|
128
128
|
end
|
129
129
|
|
130
130
|
def write_seg(reader)
|
131
131
|
return if reader.read_string.nil?
|
132
|
-
text = PrettyStrings::Cleaner.new(reader.read_string
|
132
|
+
text = PrettyStrings::Cleaner.new(reader.read_string).pretty.gsub("\\","&#92;").gsub("'",%q(\\\'))
|
133
133
|
word_count = text.gsub("\s+", ' ').split(' ').length
|
134
134
|
@doc[:seg][:vals] << [@doc[:tu][:id], @doc[:seg][:role], word_count, @doc[:seg][:lang], text, @doc[:tu][:creation_date]]
|
135
135
|
end
|
data/tmx_importer.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tmx_importer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: 0.5.0
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: charlock_holmes_bundle_icu
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.6.9.2
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 0.6.9.2
|
83
97
|
description: Import the content of a TMX translation memory file to your database
|
84
98
|
email:
|
85
99
|
- diasks2@gmail.com
|