tmx_importer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 99a80fae6e289623de257e8383f5ba8e6d90eda5
4
+ data.tar.gz: 150248c983d19427e7cf3ccbfc98c4a63227436f
5
+ SHA512:
6
+ metadata.gz: 9b869fc95d11ba967d43ef4f54914d557d10bbaa57c44831ed86e4fe36787159262a27b6c2dbcdec3902dc4936a1e7e72a6327f10d30a2576a45487dc5e7432d
7
+ data.tar.gz: 0c11e503f9447569d90aaf49199691fd7bb54bd9d24b0a4371d8242259b9353183da08dbe6c751fa10d36c9488b13aa8f2b1c0886c0a06f1cf62f703eef1333d
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,4 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.1.2
4
+ - 2.2.4
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in tmx_importer.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 Kevin S. Dias
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,70 @@
1
+ # TMX Importer
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/tmx_importer.svg)](https://badge.fury.io/rb/tmx_importer) [![Build Status](https://travis-ci.org/diasks2/tmx_importer.png)](https://travis-ci.org/diasks2/tmx_importer) [![License](https://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](https://github.com/diasks2/tmx_importer/blob/master/LICENSE.txt)
4
+
5
+ This gem handles the importing and parsing of [.tmx translation memory files](http://www.ttt.org/oscarstandards/tmx/tmx14-20020710.htm). TMX files are XML files.
6
+
7
+ ## Installation
8
+
9
+ Install the gem from the command line, or add it to your Gemfile if your application uses Bundler:
10
+
11
+ **Ruby**
12
+ ```
13
+ gem install tmx_importer
14
+ ```
15
+
16
+ **Ruby on Rails**
17
+ Add this line to your application’s Gemfile:
18
+ ```ruby
19
+ gem 'tmx_importer'
20
+ ```
21
+
22
+ ## Usage
23
+
24
+ ```ruby
25
+ # Get the high level stats of a TMX file
26
+ file_path = File.expand_path('../tmx_importer/spec/test_sample_files/test_tm(utf-8).tmx')
27
+ TmxImporter::Tmx.new(file_path: file_path, encoding: 'utf-8').stats
28
+ # => {:tu_count=>4, :seg_count=>8, :language_pairs=>[["de-DE", "en-US"]]}
29
+
30
+ # Extract the segments of a TMX file
31
+ # Result: [translation_units, segments]
32
+ # translation_units = [tu_id, creation_date]
33
+ # segments = [tu_id, segment_role, word_count, language, segment_text]
34
+
35
+ file_path = File.expand_path('../tmx_importer/spec/test_sample_files/test_tm(utf-8).tmx')
36
+ TmxImporter::Tmx.new(file_path: file_path, encoding: 'utf-8').import
37
+ # => [[["5533-1457670156-1", "2016-03-11T13:22:36+09:00"], ["6836-1457670156-3", "2016-03-11T13:22:36+09:00"], ["3285-1457670156-5", "2016-03-11T13:22:36+09:00"], ["6706-1457670156-7", "2016-03-11T13:22:36+09:00"]], [["5533-1457670156-1", "", 1, "de-DE", "überprüfen"], ["5533-1457670156-1", "target", 1, "en-US", "check"], ["6836-1457670156-3", "source", 1, "de-DE", "Rückenlehneneinstellung"], ["6836-1457670156-3", "target", 2, "en-US", "Backrest adjustment"], ["3285-1457670156-5", "source", 1, "de-DE", "Bezüglich"], ["3285-1457670156-5", "target", 3, "en-US", "In terms of"], ["6706-1457670156-7", "source", 20, "de-DE", "Der Staatsschutz prüft, ob es einen Zusammenhang mit einem Anschlag auf eine geplante Flüchtlingsunterkunft in der Nachbarschaft Ende August gibt."], ["6706-1457670156-7", "target", 23, "en-US", "The state protection checks whether there is a connection with an attack on a planned refugee camp in the neighborhood of late August."]]]
38
+ ```
39
+
40
+ ## Contributing
41
+
42
+ 1. Fork it ( https://github.com/diasks2/tmx_importer/fork )
43
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
44
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
45
+ 4. Push to the branch (`git push origin my-new-feature`)
46
+ 5. Create a new Pull Request
47
+
48
+ ## License
49
+
50
+ The MIT License (MIT)
51
+
52
+ Copyright (c) 2016 Kevin S. Dias
53
+
54
+ Permission is hereby granted, free of charge, to any person obtaining a copy
55
+ of this software and associated documentation files (the "Software"), to deal
56
+ in the Software without restriction, including without limitation the rights
57
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
58
+ copies of the Software, and to permit persons to whom the Software is
59
+ furnished to do so, subject to the following conditions:
60
+
61
+ The above copyright notice and this permission notice shall be included in
62
+ all copies or substantial portions of the Software.
63
+
64
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
65
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
66
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
67
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
68
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
69
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
70
+ THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,5 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+ task default: :spec
@@ -0,0 +1,3 @@
1
module TmxImporter
  # Release number of the gem; the gemspec reads this constant for spec.version.
  VERSION = '0.1.0'
end
@@ -0,0 +1,135 @@
1
+ require 'tmx_importer/version'
2
+ require 'xml'
3
+ require 'open-uri'
4
+ require 'pretty_strings'
5
+
6
+ Encoding.default_internal = Encoding::UTF_8
7
+ Encoding.default_external = Encoding::UTF_8
8
+
9
module TmxImporter
  # Reads a TMX translation memory file and either summarises it (#stats) or
  # extracts its translation units and segments (#import).
  #
  # All running state lives in @doc and is never reset, so counters and value
  # lists accumulate when #stats / #import are called more than once on the
  # same instance. Use a fresh instance per call.
  class Tmx
    # Encodings accepted by #initialize (compared after upcasing).
    SUPPORTED_ENCODINGS = %w[UTF-8 UTF-16LE UTF-16BE].freeze

    # Value of a quoted `srclang` attribute on a raw line of text.
    # Accepts single or double quotes and stops at the first closing quote.
    SRCLANG_PATTERN = /(?<=srclang=["'])[^"']+(?=["'])/

    # Value of a quoted `lang` / `xml:lang` attribute on a raw line of text.
    # The leading [^cn] skips `srclang=` and `adminlang=`.
    SEG_LANG_PATTERN = /(?<=[^cn]lang=["'])[^"']+(?=["'])/

    # Node names are compared as byte arrays, as the original implementation
    # did. Presumably this keeps the comparison independent of the encoding
    # tag on strings returned by the XML reader (TODO confirm).
    IGNORED_TAGS = %w[ept bpt prop ph].map(&:bytes).freeze
    TEXT_NODE    = '#text'.bytes.freeze
    HEADER_TAG   = 'header'.bytes.freeze
    TU_TAG       = 'tu'.bytes.freeze
    TUV_TAG      = 'tuv'.bytes.freeze
    SEG_TAG      = 'seg'.bytes.freeze

    attr_reader :file_path, :encoding

    # @param file_path [String] location of the TMX file
    # @param encoding [String] 'UTF-8', 'UTF-16LE' or 'UTF-16BE' (any letter case)
    # @raise [RuntimeError] when the encoding is not one of the supported values
    def initialize(file_path:, encoding:)
      @file_path = file_path
      @encoding = encoding.upcase
      @doc = {
        source_language: "",
        tu: { id: "", counter: 0, vals: [], lang: "" },
        seg: { lang: "", counter: 0, vals: [], role: "" },
        language_pairs: []
      }
      raise "Encoding type not supported. Please choose an encoding of UTF-8, UTF-16LE, or UTF-16BE" unless SUPPORTED_ENCODINGS.include?(@encoding)
    end

    # Scans the file line by line (no XML parsing) and counts closing </tu>
    # and </seg> tags and the distinct source/target language pairs.
    #
    # @return [Hash] {tu_count: Integer, seg_count: Integer, language_pairs: Array<Array<String>>}
    def stats
      File.open(@file_path, "rb:#{encoding}") do |file|
        file.each { |line| analyze_line(line) }
      end
      {tu_count: @doc[:tu][:counter], seg_count: @doc[:seg][:counter], language_pairs: @doc[:language_pairs].uniq}
    end

    # Parses the file with the XML reader.
    #
    # @return [Array] [translation_units, segments] where
    #   translation_units = [[tu_id, creation_date], ...] and
    #   segments = [[tu_id, segment_role, word_count, language, segment_text], ...]
    def import
      reader = read_file
      parse_file(reader)
      [@doc[:tu][:vals], @doc[:seg][:vals]]
    ensure
      # The reader does not own the handle opened in #read_file, so release it here.
      @source_io.close if @source_io && !@source_io.closed?
    end

    private

    # Opens the source and wraps it in a streaming XML reader.
    def read_file
      # NOTE(review): Kernel#open treats a path starting with "|" as a command
      # to run. Do not pass untrusted file_path values without validating them.
      @source_io = open(file_path)
      XML::Reader.io(@source_io, options: XML::Parser::Options::NOERROR, encoding: set_encoding)
    end

    # Updates the #stats counters and language pairs from one raw line.
    def analyze_line(line)
      @doc[:source_language] = line[SRCLANG_PATTERN] if line.include?('srclang=')
      @doc[:tu][:counter] += line.scan(/<\/tu>/).count
      @doc[:seg][:counter] += line.scan(/<\/seg>/).count
      if line.include?('lang')
        # nil when the line mentions "lang" without a lang attribute;
        # write_language_pair ignores a nil language.
        @doc[:seg][:lang] = line[SEG_LANG_PATTERN]
        write_language_pair
      end
    end

    # Maps the validated encoding name onto the XML library's encoding constant.
    def set_encoding
      case encoding
      when 'UTF-8'    then XML::Encoding::UTF_8
      when 'UTF-16LE' then XML::Encoding::UTF_16LE
      when 'UTF-16BE' then XML::Encoding::UTF_16BE
      end
    end

    # Walks every node the reader yields, tracking the open elements in
    # tag_stack and dispatching each newly seen element to eval_state_initial.
    # The reader is closed even when parsing raises.
    def parse_file(reader)
      tag_stack = []
      generate_unique_id
      while reader.read
        tag_stack.delete_if { |tag| IGNORED_TAGS.include?(tag.bytes) }
        name = reader.name
        if !tag_stack.include?(name)
          tag_stack.push(name)
          eval_state_initial(tag_stack, reader)
        elsif tag_stack.last == name
          tag_stack.pop if tag_stack.last.bytes == TEXT_NODE
          # The second pop only runs when more than three names are stacked.
          # A popped `tu` marks the end of a unit, so the next one gets a new id.
          generate_unique_id if tag_stack.length > 3 && tag_stack.pop.bytes == TU_TAG
        end
      end
    ensure
      reader.close
    end

    # Handles the element that was just pushed onto tag_stack.
    def eval_state_initial(tag_stack, reader)
      case tag_stack.last.bytes
      when HEADER_TAG
        @doc[:source_language] = reader.get_attribute("srclang").force_encoding("UTF-8") if @doc[:source_language].to_s.empty? && reader.has_attributes? && reader.get_attribute("srclang")
      when TU_TAG
        write_tu(reader)
        @doc[:tu][:counter] += 1
      when TUV_TAG
        seg_lang = reader.get_attribute("lang") || reader.get_attribute("xml:lang")
        # A <tuv> without any language attribute yields nil; keep the previous language.
        @doc[:seg][:lang] = seg_lang.force_encoding("UTF-8") unless seg_lang.nil? || seg_lang.empty?
      when SEG_TAG
        # Work out the role first so it describes this segment, not the previous one.
        write_language_pair
        write_seg(reader)
        @doc[:seg][:counter] += 1
      end
    end

    # Classifies the current segment language against the source language:
    # records a [source, target] pair and marks the segment 'target' when the
    # base languages differ, otherwise marks it 'source'. A source language of
    # '*all*' is replaced by the first segment language seen.
    # Does nothing while either language is unknown.
    def write_language_pair
      seg_lang = @doc[:seg][:lang]
      source_lang = @doc[:source_language]
      return if seg_lang.nil? || seg_lang.empty? || source_lang.nil? || source_lang.empty?

      if source_lang == '*all*'
        @doc[:source_language] = seg_lang
        @doc[:seg][:role] = 'source'
      elsif seg_lang != source_lang &&
            seg_lang.split('-')[0].downcase != source_lang.split('-')[0].downcase
        @doc[:language_pairs] << [source_lang, seg_lang]
        @doc[:seg][:role] = 'target'
      else
        @doc[:seg][:role] = 'source'
      end
    end

    # Records [tu_id, creation_date] for the current <tu>; the date falls back
    # to the current time when the element has no creationdate attribute.
    def write_tu(reader)
      @doc[:tu][:lang] = reader.get_attribute("srclang")
      creation_date = reader.get_attribute("creationdate")
      created_date = creation_date.nil? ? DateTime.now.to_s : DateTime.parse(creation_date).to_s
      @doc[:tu][:vals] << [@doc[:tu][:id], created_date]
    end

    # Records [tu_id, role, word_count, language, text] for a non-empty <seg>.
    # Backslashes become "&#92;" and single quotes are backslash-escaped.
    def write_seg(reader)
      raw = reader.read_string
      return if raw.empty?
      text = PrettyStrings::Cleaner.new(raw.force_encoding('UTF-8')).pretty.gsub("\\", "&#92;").gsub("'", %q(\\\'))
      # Split on whitespace runs. The previous gsub("\s+", ' ') used a String
      # pattern, i.e. the literal " +", and swallowed plus signs after a space.
      word_count = text.split.length
      @doc[:seg][:vals] << [@doc[:tu][:id], @doc[:seg][:role], word_count, @doc[:seg][:lang], text]
    end

    # Builds the id for the next translation unit: four random digits, the
    # current epoch second and the tu counter, joined by "-".
    # Note that this also increments the tu counter.
    def generate_unique_id
      random_digits = Array.new(4) { rand(10) }.join
      @doc[:tu][:id] = [random_digits, Time.now.to_i, @doc[:tu][:counter] += 1].join("-")
    end
  end
end
@@ -0,0 +1,2 @@
1
+ $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
2
+ require 'tmx_importer'
@@ -0,0 +1,38 @@
1
+ <?xml version="1.0" encoding="utf-8"?>
2
+ <tmx version="1.4">
3
+ <header creationtool="AwesomeTool" creationtoolversion="2.0" datatype="plaintext" segtype="sentence" o-tmf="AwesomeTool TMX" adminlang="EN-US" srclang="de-DE"></header>
4
+ <body>
5
+ <tu tuid="1">
6
+ <tuv xml:lang="de-DE">
7
+ <seg>�berpr�fen</seg>
8
+ </tuv>
9
+ <tuv xml:lang="en-US">
10
+ <seg>check</seg>
11
+ </tuv>
12
+ </tu>
13
+ <tu tuid="2">
14
+ <tuv xml:lang="de-DE">
15
+ <seg>R�ckenlehneneinstellung</seg>
16
+ </tuv>
17
+ <tuv xml:lang="en-US">
18
+ <seg>Backrest adjustment</seg>
19
+ </tuv>
20
+ </tu>
21
+ <tu tuid="3">
22
+ <tuv xml:lang="de-DE">
23
+ <seg>Bez�glich</seg>
24
+ </tuv>
25
+ <tuv xml:lang="en-US">
26
+ <seg>In terms of</seg>
27
+ </tuv>
28
+ </tu>
29
+ <tu tuid="4">
30
+ <tuv xml:lang="de-DE">
31
+ <seg>Der Staatsschutz pr�ft, ob es einen Zusammenhang mit einem Anschlag auf eine geplante Fl�chtlingsunterkunft in der Nachbarschaft Ende August gibt.</seg>
32
+ </tuv>
33
+ <tuv xml:lang="en-US">
34
+ <seg>The state protection checks whether there is a connection with an attack on a planned refugee camp in the neighborhood of late August.</seg>
35
+ </tuv>
36
+ </tu>
37
+ </body>
38
+ </tmx>
@@ -0,0 +1,72 @@
1
+ <?xml version="1.0" encoding="utf-8"?>
2
+ <!DOCTYPE tmx SYSTEM "tmx11.dtd">
3
+ <tmx version="1.1">
4
+ <header
5
+ creationtool="AwesomeTool"
6
+ creationtoolversion="Edition 8 Build 863"
7
+ segtype="sentence"
8
+ o-tmf="TW4Win 2.0 Format"
9
+ adminlang="EN-US"
10
+ srclang="EN-US"
11
+ datatype="rtf"
12
+ creationdate="20110527T214247Z"
13
+ creationid="A"
14
+ >
15
+ <prop type="RTFFontTable">
16
+ {\fonttbl
17
+ {\f1 \fmodern\fprq1 \fcharset0 Courier New;}
18
+ {\f2 \fswiss\fprq2 \fcharset0 Arial;}
19
+ {\f3 \fcharset238 Arial CE;}
20
+ {\f4 \fmodern\fprq1 \fcharset238 Courier New CE;}
21
+ {\f5 \fcharset0 Arial Unicode MS;}
22
+ {\f6 \fcharset238 Arial Unicode MS CE;}
23
+ {\f7 \fcharset238 Tahoma CE;}
24
+ {\f8 \fcharset0 Tahoma;}}</prop>
25
+ <prop type="RTFStyleSheet">
26
+ {\stylesheet
27
+ {\St \s0 {\StN Normal}}
28
+ {\St \cs1 {\StB \v\f1\fs24\sub\cf12 }{\StN tw4winMark}}
29
+ {\St \cs2 {\StB \cf4\fs40\f1 }{\StN tw4winError}}
30
+ {\St \cs3 {\StB \f1\cf11\lang1024 }{\StN tw4winPopup}}
31
+ {\St \cs4 {\StB \f1\cf10\lang1024 }{\StN tw4winJump}}
32
+ {\St \cs5 {\StB \f1\cf15\lang1024 }{\StN tw4winExternal}}
33
+ {\St \cs6 {\StB \f1\cf6\lang1024 }{\StN tw4winInternal}}
34
+ {\St \cs7 {\StB \cf2 }{\StN tw4winTerm}}
35
+ {\St \cs8 {\StB \f1\cf13\lang1024 }{\StN DO_NOT_TRANSLATE}}}</prop>
36
+ </header>
37
+
38
+ <body>
39
+ <tu tuid=""1"">
40
+ <tuv xml:lang="de-DE">
41
+ <seg>überprüfen</seg>
42
+ </tuv>
43
+ <tuv xml:lang="en-US">
44
+ <seg>check</seg>
45
+ </tuv>
46
+ </tu>
47
+ <tu tuid="2">
48
+ <tuv xml:lang="de-DE">
49
+ <seg>Rückenlehneneinstellung</seg>
50
+ </tuv>
51
+ <tuv xml:lang="en-US">
52
+ <seg>Backrest adjustment</seg>
53
+ </tuv>
54
+ </tu>
55
+ <tu tuid="3">
56
+ <tuv xml:lang="de-DE">
57
+ <seg>Bezüglich</seg>
58
+ </tuv>
59
+ <tuv xml:lang="en-US">
60
+ <seg>In terms of</seg>
61
+ </tuv>
62
+ </tu>
63
+ <tu tuid="4">
64
+ <tuv xml:lang="de-DE">
65
+ <seg>Der Staatsschutz prüft, ob es einen Zusammenhang mit einem Anschlag auf eine geplante Flüchtlingsunterkunft in der Nachbarschaft Ende August gibt.</seg>
66
+ </tuv>
67
+ <tuv xml:lang="en-US">
68
+ <seg>The state protection checks whether there is a connection with an attack on a planned refugee camp in the neighborhood of late August.</seg>
69
+ </tuv>
70
+ </tu>
71
+ </body>
72
+ </tmx>
@@ -0,0 +1,44 @@
1
+ <?xml version="1.0" encoding="utf-8"?>
2
+ <tmx version="1.4">
3
+ <header creationtool="AwesomeTool" creationtoolversion="2.0" datatype="plaintext" segtype="sentence" o-tmf="AwesomeTool TMX" adminlang="EN-US" srclang="de-DE"></header>
4
+ <body>
5
+ <tu tuid="1">
6
+ <tuv xml:lang="de-DE">
7
+ <seg>überprüfen</seg>
8
+ </tuv>
9
+ <tuv xml:lang="en-US">
10
+ <seg>check</seg>
11
+ </tuv>
12
+ <tuv xml:lang="it">
13
+ <seg>controllare</seg>
14
+ </tuv>
15
+ <tuv xml:lang="fr">
16
+ <seg>vérifier</seg>
17
+ </tuv>
18
+ </tu>
19
+ <tu tuid="2">
20
+ <tuv xml:lang="de-DE">
21
+ <seg>Rückenlehneneinstellung</seg>
22
+ </tuv>
23
+ <tuv xml:lang="en-US">
24
+ <seg>Backrest adjustment</seg>
25
+ </tuv>
26
+ </tu>
27
+ <tu tuid="3">
28
+ <tuv xml:lang="de-DE">
29
+ <seg>Bezüglich</seg>
30
+ </tuv>
31
+ <tuv xml:lang="en-US">
32
+ <seg>In terms of</seg>
33
+ </tuv>
34
+ </tu>
35
+ <tu tuid="4">
36
+ <tuv xml:lang="de-DE">
37
+ <seg>Der Staatsschutz prüft, ob es einen Zusammenhang mit einem Anschlag auf eine geplante Flüchtlingsunterkunft in der Nachbarschaft Ende August gibt.</seg>
38
+ </tuv>
39
+ <tuv xml:lang="en-US">
40
+ <seg>The state protection checks whether there is a connection with an attack on a planned refugee camp in the neighborhood of late August.</seg>
41
+ </tuv>
42
+ </tu>
43
+ </body>
44
+ </tmx>
@@ -0,0 +1,38 @@
1
+ <?xml version="1.0" encoding="utf-8"?>
2
+ <tmx version="1.4">
3
+ <header creationtool="AwesomeTool" creationtoolversion="2.0" datatype="plaintext" segtype="sentence" o-tmf="AwesomeTool TMX" adminlang="EN-US" srclang="de-DE"></header>
4
+ <body>
5
+ <tu tuid="1">
6
+ <tuv xml:lang="en-US">
7
+ <seg>check</seg>
8
+ </tuv>
9
+ <tuv xml:lang="de-DE">
10
+ <seg>überprüfen</seg>
11
+ </tuv>
12
+ </tu>
13
+ <tu tuid="2">
14
+ <tuv xml:lang="en-US">
15
+ <seg>Backrest adjustment</seg>
16
+ </tuv>
17
+ <tuv xml:lang="de-DE">
18
+ <seg>Rückenlehneneinstellung</seg>
19
+ </tuv>
20
+ </tu>
21
+ <tu tuid="3">
22
+ <tuv xml:lang="en-US">
23
+ <seg>In terms of</seg>
24
+ </tuv>
25
+ <tuv xml:lang="de-DE">
26
+ <seg>Bezüglich</seg>
27
+ </tuv>
28
+ </tu>
29
+ <tu tuid="4">
30
+ <tuv xml:lang="en-US">
31
+ <seg>The state protection checks whether there is a connection with an attack on a planned refugee camp in the neighborhood of late August.</seg>
32
+ </tuv>
33
+ <tuv xml:lang="de-DE">
34
+ <seg>Der Staatsschutz prüft, ob es einen Zusammenhang mit einem Anschlag auf eine geplante Flüchtlingsunterkunft in der Nachbarschaft Ende August gibt.</seg>
35
+ </tuv>
36
+ </tu>
37
+ </body>
38
+ </tmx>
@@ -0,0 +1,44 @@
1
+ <?xml version="1.0" encoding="utf-8"?>
2
+ <tmx version="1.4">
3
+ <header creationtool="AwesomeTool" creationtoolversion="2.0" datatype="plaintext" segtype="sentence" o-tmf="AwesomeTool TMX" adminlang="EN-US" srclang="*all*"></header>
4
+ <body>
5
+ <tu tuid="1">
6
+ <tuv xml:lang="de-DE">
7
+ <seg>überprüfen</seg>
8
+ </tuv>
9
+ <tuv xml:lang="en-US">
10
+ <seg>check</seg>
11
+ </tuv>
12
+ <tuv xml:lang="it">
13
+ <seg>controllare</seg>
14
+ </tuv>
15
+ <tuv xml:lang="fr">
16
+ <seg>vérifier</seg>
17
+ </tuv>
18
+ </tu>
19
+ <tu tuid="2">
20
+ <tuv xml:lang="de-DE">
21
+ <seg>Rückenlehneneinstellung</seg>
22
+ </tuv>
23
+ <tuv xml:lang="en-US">
24
+ <seg>Backrest adjustment</seg>
25
+ </tuv>
26
+ </tu>
27
+ <tu tuid="3">
28
+ <tuv xml:lang="de-DE">
29
+ <seg>Bezüglich</seg>
30
+ </tuv>
31
+ <tuv xml:lang="en-US">
32
+ <seg>In terms of</seg>
33
+ </tuv>
34
+ </tu>
35
+ <tu tuid="4">
36
+ <tuv xml:lang="de-DE">
37
+ <seg>Der Staatsschutz prüft, ob es einen Zusammenhang mit einem Anschlag auf eine geplante Flüchtlingsunterkunft in der Nachbarschaft Ende August gibt.</seg>
38
+ </tuv>
39
+ <tuv xml:lang="en-US">
40
+ <seg>The state protection checks whether there is a connection with an attack on a planned refugee camp in the neighborhood of late August.</seg>
41
+ </tuv>
42
+ </tu>
43
+ </body>
44
+ </tmx>
@@ -0,0 +1,38 @@
1
+ <?xml version="1.0" encoding="utf-8"?>
2
+ <tmx version="1.4">
3
+ <header creationtool="AwesomeTool" creationtoolversion="2.0" datatype="plaintext" segtype="sentence" o-tmf="AwesomeTool TMX" adminlang="EN-US" srclang="de-DE"></header>
4
+ <body>
5
+ <tu tuid="1">
6
+ <tuv xml:lang="de-DE">
7
+ <seg>überprüfen</seg>
8
+ </tuv>
9
+ <tuv xml:lang="en-US">
10
+ <seg>check</seg>
11
+ </tuv>
12
+ </tu>
13
+ <tu tuid="2">
14
+ <tuv xml:lang="de-DE">
15
+ <seg>Rückenlehneneinstellung</seg>
16
+ </tuv>
17
+ <tuv xml:lang="en-US">
18
+ <seg>Backrest adjustment</seg>
19
+ </tuv>
20
+ </tu>
21
+ <tu tuid="3">
22
+ <tuv xml:lang="de-DE">
23
+ <seg>Bezüglich</seg>
24
+ </tuv>
25
+ <tuv xml:lang="en-US">
26
+ <seg>In terms of</seg>
27
+ </tuv>
28
+ </tu>
29
+ <tu tuid="4">
30
+ <tuv xml:lang="de-DE">
31
+ <seg>Der Staatsschutz prüft, ob es einen Zusammenhang mit einem Anschlag auf eine geplante Flüchtlingsunterkunft in der Nachbarschaft Ende August gibt.</seg>
32
+ </tuv>
33
+ <tuv xml:lang="en-US">
34
+ <seg>The state protection checks whether there is a connection with an attack on a planned refugee camp in the neighborhood of late August.</seg>
35
+ </tuv>
36
+ </tu>
37
+ </body>
38
+ </tmx>
@@ -0,0 +1,38 @@
1
+ <?xml version="1.0" encoding="utf-8"?>
2
+ <tmx version="1.4">
3
+ <header creationtool="AwesomeTool" creationtoolversion="2.0" datatype="plaintext" segtype="sentence" o-tmf="AwesomeTool TMX" adminlang="EN" srclang="de"></header>
4
+ <body>
5
+ <tu tuid="1">
6
+ <tuv xml:lang="de">
7
+ <seg>überprüfen</seg>
8
+ </tuv>
9
+ <tuv xml:lang="en">
10
+ <seg>check</seg>
11
+ </tuv>
12
+ </tu>
13
+ <tu tuid="2">
14
+ <tuv xml:lang="de">
15
+ <seg>Rückenlehneneinstellung</seg>
16
+ </tuv>
17
+ <tuv xml:lang="en">
18
+ <seg>Backrest adjustment</seg>
19
+ </tuv>
20
+ </tu>
21
+ <tu tuid="3">
22
+ <tuv xml:lang="de-DE">
23
+ <seg>Bezüglich</seg>
24
+ </tuv>
25
+ <tuv xml:lang="en">
26
+ <seg>In terms of</seg>
27
+ </tuv>
28
+ </tu>
29
+ <tu tuid="4">
30
+ <tuv xml:lang="de">
31
+ <seg>Der Staatsschutz prüft, ob es einen Zusammenhang mit einem Anschlag auf eine geplante Flüchtlingsunterkunft in der Nachbarschaft Ende August gibt.</seg>
32
+ </tuv>
33
+ <tuv xml:lang="en">
34
+ <seg>The state protection checks whether there is a connection with an attack on a planned refugee camp in the neighborhood of late August.</seg>
35
+ </tuv>
36
+ </tu>
37
+ </body>
38
+ </tmx>
@@ -0,0 +1,121 @@
1
+ require 'spec_helper'
2
+
3
+ describe TmxImporter do
4
+ it 'has a version number' do
5
+ expect(TmxImporter::VERSION).not_to be nil
6
+ end
7
+
8
+ it 'raises an error if the encoding is not supported' do
9
+ file_path = File.expand_path('../tmx_importer/spec/test_sample_files/test_tm(utf-8).tmx')
10
+ -> { expect(TmxImporter::Tmx.new(file_path: file_path, encoding: 'ISO-8859-9').stats).to raise_error }
11
+ end
12
+
13
+ it 'raises an error if the wrong encoding is specified in the file' do
14
+ file_path = File.expand_path('../tmx_importer/spec/test_sample_files/bad_encoding.tmx')
15
+ -> { expect(TmxImporter::Tmx.new(file_path: file_path, encoding: 'utf-8').stats).to raise_error }
16
+ end
17
+
18
+ it 'raises an error if the file contains bad markup' do
19
+ file_path = File.expand_path('../tmx_importer/spec/test_sample_files/bad_markup(utf-8).tmx')
20
+ -> { expect(TmxImporter::Tmx.new(file_path: file_path, encoding: 'utf-8').stats).to raise_error }
21
+ end
22
+
23
+ it 'raises an error if the file contains bad markup 2' do
24
+ file_path = File.expand_path('../tmx_importer/spec/test_sample_files/bad_markup(utf-16).tmx')
25
+ -> { expect(TmxImporter::Tmx.new(file_path: file_path, encoding: 'utf-16le').stats).to raise_error }
26
+ end
27
+
28
+ describe '#stats' do
29
+ it 'reports the stats of a UTF-8 TMX file' do
30
+ file_path = File.expand_path('../tmx_importer/spec/test_sample_files/test_tm(utf-8).tmx')
31
+ tmx = TmxImporter::Tmx.new(file_path: file_path, encoding: 'utf-8')
32
+ expect(tmx.stats).to eq({:tu_count=>4, :seg_count=>8, :language_pairs=>[["de-DE", "en-US"]]})
33
+ end
34
+
35
+ it 'reports the stats of a UTF-8 TMX file 2' do
36
+ file_path = File.expand_path('../tmx_importer/spec/test_sample_files/test_tm_2(utf-8).tmx')
37
+ tmx = TmxImporter::Tmx.new(file_path: file_path, encoding: 'utf-8')
38
+ expect(tmx.stats).to eq({:tu_count=>4, :seg_count=>8, :language_pairs=>[["de", "en"]]})
39
+ end
40
+
41
+ it 'reports the stats of a UTF-16LE TMX file' do
42
+ file_path = File.expand_path('../tmx_importer/spec/test_sample_files/test_tm(utf-16LE).tmx')
43
+ tmx = TmxImporter::Tmx.new(file_path: file_path, encoding: 'utf-16le')
44
+ expect(tmx.stats).to eq({:tu_count=>4, :seg_count=>8, :language_pairs=>[["de-DE", "en-US"]]})
45
+ end
46
+
47
+ it 'reports the stats of a UTF-16LE BOM TMX file' do
48
+ file_path = File.expand_path('../tmx_importer/spec/test_sample_files/test_tm(utf-16LE BOM).tmx')
49
+ tmx = TmxImporter::Tmx.new(file_path: file_path, encoding: 'utf-16le')
50
+ expect(tmx.stats).to eq({:tu_count=>4, :seg_count=>8, :language_pairs=>[["de-DE", "en-US"]]})
51
+ end
52
+
53
+ it 'reports the stats of a multiple language pair TMX file' do
54
+ file_path = File.expand_path('../tmx_importer/spec/test_sample_files/multiple_language_pairs.tmx')
55
+ tmx = TmxImporter::Tmx.new(file_path: file_path, encoding: 'utf-8')
56
+ expect(tmx.stats).to eq({:tu_count=>4, :seg_count=>10, :language_pairs=>[["de-DE", "en-US"], ["de-DE", "it"], ["de-DE", "fr"]]})
57
+ end
58
+
59
+ it 'reports the stats of a srclang equals *all* TMX file' do
60
+ file_path = File.expand_path('../tmx_importer/spec/test_sample_files/srclang_all.tmx')
61
+ tmx = TmxImporter::Tmx.new(file_path: file_path, encoding: 'utf-8')
62
+ expect(tmx.stats).to eq({:tu_count=>4, :seg_count=>10, :language_pairs=>[["de-DE", "en-US"], ["de-DE", "it"], ["de-DE", "fr"]]})
63
+ end
64
+
65
+ it 'reports the stats of a TMX file with out of order segments' do
66
+ file_path = File.expand_path('../tmx_importer/spec/test_sample_files/out_of_order_segments.tmx')
67
+ tmx = TmxImporter::Tmx.new(file_path: file_path, encoding: 'utf-8')
68
+ expect(tmx.stats).to eq({:tu_count=>4, :seg_count=>8, :language_pairs=>[["de-DE", "en-US"]]})
69
+ end
70
+ end
71
+
72
+ describe '#import' do
73
+ it 'imports a UTF-8 TMX file' do
74
+ file_path = File.expand_path('../tmx_importer/spec/test_sample_files/test_tm(utf-8).tmx')
75
+ tmx = TmxImporter::Tmx.new(file_path: file_path, encoding: 'utf-8').import
76
+ expect(tmx[1][2][3]).to eq("de-DE")
77
+ end
78
+
79
+ it 'imports a UTF-8 TMX file 2' do
80
+ file_path = File.expand_path('../tmx_importer/spec/test_sample_files/test_tm_2(utf-8).tmx')
81
+ tmx = TmxImporter::Tmx.new(file_path: file_path, encoding: 'utf-8')
82
+ expect(tmx.import[1][2][4]).to eq("Rückenlehneneinstellung")
83
+ end
84
+
85
+ it 'imports a UTF-16LE TMX file' do
86
+ file_path = File.expand_path('../tmx_importer/spec/test_sample_files/test_tm(utf-16LE).tmx')
87
+ tmx = TmxImporter::Tmx.new(file_path: file_path, encoding: 'utf-16le')
88
+ expect(tmx.import[1][3][4]).to eq("Backrest adjustment")
89
+ end
90
+
91
+ it 'imports a UTF-16LE BOM TMX file' do
92
+ file_path = File.expand_path('../tmx_importer/spec/test_sample_files/test_tm(utf-16LE BOM).tmx')
93
+ tmx = TmxImporter::Tmx.new(file_path: file_path, encoding: 'utf-16le')
94
+ expect(tmx.import[1][2][3]).to eq("de-DE")
95
+ end
96
+
97
+ it 'imports a multiple language pair TMX file' do
98
+ file_path = File.expand_path('../tmx_importer/spec/test_sample_files/multiple_language_pairs.tmx')
99
+ tmx = TmxImporter::Tmx.new(file_path: file_path, encoding: 'utf-8')
100
+ expect(tmx.import[1][2][3]).to eq("it")
101
+ end
102
+
103
+ it 'imports a srclang equals *all* TMX file' do
104
+ file_path = File.expand_path('../tmx_importer/spec/test_sample_files/srclang_all.tmx')
105
+ tmx = TmxImporter::Tmx.new(file_path: file_path, encoding: 'utf-8')
106
+ expect(tmx.import[1][2][3]).to eq("it")
107
+ end
108
+
109
+ it 'imports a TMX file with out of order segments' do
110
+ file_path = File.expand_path('../tmx_importer/spec/test_sample_files/out_of_order_segments.tmx')
111
+ tmx = TmxImporter::Tmx.new(file_path: file_path, encoding: 'utf-8')
112
+ expect(tmx.import[1][2][3]).to eq("en-US")
113
+ end
114
+
115
+ it 'imports a TMX file with out of order segments' do
116
+ file_path = File.expand_path('../tmx_importer/spec/test_sample_files/out_of_order_segments.tmx')
117
+ tmx = TmxImporter::Tmx.new(file_path: file_path, encoding: 'utf-8')
118
+ expect(tmx.import[1][2][1]).to eq("target")
119
+ end
120
+ end
121
+ end
@@ -0,0 +1,26 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'tmx_importer/version'
5
+
6
# Gem metadata for tmx_importer. File lists come from `git ls-files`, so the
# gem must be built from inside a git checkout.
Gem::Specification.new do |spec|
  spec.name          = "tmx_importer"
  spec.version       = TmxImporter::VERSION
  spec.authors       = ["Kevin S. Dias"]
  spec.email         = ["diasks2@gmail.com"]

  spec.summary       = %q{TMX translation memory file importer}
  spec.description   = %q{Import the content of a TMX translation memory file to your database}
  spec.homepage      = "https://github.com/diasks2/tmx_importer"
  # Declares the licence shipped in LICENSE.txt so the built gem no longer
  # reports an empty licence list.
  spec.license       = "MIT"

  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.9"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec"
  spec.add_runtime_dependency "libxml-ruby"
  spec.add_runtime_dependency "pretty_strings", "~> 0.5.0"
end
metadata ADDED
@@ -0,0 +1,147 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tmx_importer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Kevin S. Dias
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-03-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.9'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.9'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: libxml-ruby
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: pretty_strings
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: 0.5.0
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 0.5.0
83
+ description: Import the content of a TMX translation memory file to your database
84
+ email:
85
+ - diasks2@gmail.com
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - ".gitignore"
91
+ - ".rspec"
92
+ - ".travis.yml"
93
+ - Gemfile
94
+ - LICENSE.txt
95
+ - README.md
96
+ - Rakefile
97
+ - lib/tmx_importer.rb
98
+ - lib/tmx_importer/version.rb
99
+ - spec/spec_helper.rb
100
+ - spec/test_sample_files/bad_encoding.tmx
101
+ - spec/test_sample_files/bad_markup(utf-16).tmx
102
+ - spec/test_sample_files/bad_markup(utf-8).tmx
103
+ - spec/test_sample_files/multiple_language_pairs.tmx
104
+ - spec/test_sample_files/out_of_order_segments.tmx
105
+ - spec/test_sample_files/srclang_all.tmx
106
+ - spec/test_sample_files/test_tm(utf-16LE BOM).tmx
107
+ - spec/test_sample_files/test_tm(utf-16LE).tmx
108
+ - spec/test_sample_files/test_tm(utf-8).tmx
109
+ - spec/test_sample_files/test_tm_2(utf-8).tmx
110
+ - spec/tmx_importer_spec.rb
111
+ - tmx_importer.gemspec
112
+ homepage: https://github.com/diasks2/tmx_importer
113
+ licenses: []
114
+ metadata: {}
115
+ post_install_message:
116
+ rdoc_options: []
117
+ require_paths:
118
+ - lib
119
+ required_ruby_version: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: '0'
124
+ required_rubygems_version: !ruby/object:Gem::Requirement
125
+ requirements:
126
+ - - ">="
127
+ - !ruby/object:Gem::Version
128
+ version: '0'
129
+ requirements: []
130
+ rubyforge_project:
131
+ rubygems_version: 2.4.1
132
+ signing_key:
133
+ specification_version: 4
134
+ summary: TMX translation memory file importer
135
+ test_files:
136
+ - spec/spec_helper.rb
137
+ - spec/test_sample_files/bad_encoding.tmx
138
+ - spec/test_sample_files/bad_markup(utf-16).tmx
139
+ - spec/test_sample_files/bad_markup(utf-8).tmx
140
+ - spec/test_sample_files/multiple_language_pairs.tmx
141
+ - spec/test_sample_files/out_of_order_segments.tmx
142
+ - spec/test_sample_files/srclang_all.tmx
143
+ - spec/test_sample_files/test_tm(utf-16LE BOM).tmx
144
+ - spec/test_sample_files/test_tm(utf-16LE).tmx
145
+ - spec/test_sample_files/test_tm(utf-8).tmx
146
+ - spec/test_sample_files/test_tm_2(utf-8).tmx
147
+ - spec/tmx_importer_spec.rb