konjak 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
module Konjak
  # Container element. Adds no behaviour of its own on top of Element;
  # classes such as Tmx, TranslationUnit and TranslationUnitVariant
  # derive from it.
  class StructuralElement < Element; end
end
@@ -0,0 +1,15 @@
1
module Konjak
  # Sub-flow inline element.
  class SubFlow < InlineElement
    # Optional attributes.
    attr_accessor :data_type, :type

    # Tells whether +element+ is an instance of one of the classes a
    # sub-flow may hold.
    #
    # FIXME: the content model is text data plus zero, one or more
    # <bpt>, <ept>, <it>, <ph> and <hi> elements in any order, except that
    # each <bpt> must have a subsequent corresponding <ept>. Only class
    # membership is checked here; ordering and pairing are not enforced.
    #
    # @param element [Object] candidate child
    # @return [Boolean]
    def can_contain?(element)
      allowed_child_classes.any? { |klass| klass === element }
    end

    private

    # Classes accepted by #can_contain?. Built on each call so the
    # constants are resolved at call time rather than when the class loads.
    def allowed_child_classes
      [Text, BeginPairedTag, EndPairedTag, IsolatedTag, Placeholder, Hilight]
    end
  end
end
@@ -0,0 +1,15 @@
1
module Konjak
  # Thin wrapper around a piece of text content.
  class Text
    # @param text [String, nil] the wrapped content
    def initialize(text)
      @text = text
    end

    # Always returns a String: a nil (or other non-String) payload is
    # coerced, so string interpolation and String methods called on the
    # result behave as expected.
    #
    # @return [String]
    def to_s
      @text.to_s
    end

    # @return [Integer] number of characters in the string form of the content
    def length
      to_s.length
    end
  end
end
data/lib/konjak/tmx.rb ADDED
@@ -0,0 +1,27 @@
1
module Konjak
  # Root <tmx> element: exposes the version attribute and wraps the
  # <header> and <body> children.
  class Tmx < StructuralElement
    # Required attribute.
    attr_accessor :version

    # @param tmx [#[], #children] parsed <tmx> node; each child must respond
    #   to +name+, as the other element classes in this library also assume
    def initialize(tmx)
      @version = tmx[:version]
      # TODO - better error handling
      # Look the children up by element name instead of by position, so
      # that whitespace or comment nodes between them cannot shift the
      # header/body assignment.
      children = tmx.children
      @header = children.detect { |child| child.name == 'header' }
      @body = children.detect { |child| child.name == 'body' }
    end

    # @return [Header] a new wrapper around the <header> node on every call
    def header
      Header.new @header
    end

    # @return [Body] a new wrapper around the <body> node on every call
    def body
      Body.new @body
    end

    # FIXME
    # One <header> followed by
    # One <body> element.
    # Only the class of +element+ is checked; count and order are not.
    #
    # @param element [Object] candidate child
    # @return [Boolean]
    def can_contain?(element)
      Header === element || Body === element
    end
  end
end
@@ -0,0 +1,40 @@
1
module Konjak
  # A <tu> element: its attributes plus the <tuv> variants it holds.
  class TranslationUnit < StructuralElement
    # Optional attributes.
    attr_accessor :tuid, :o_encoding, :data_type, :usage_count, :last_usage_date
    attr_accessor :creation_tool, :creation_tool_version, :creation_date
    attr_accessor :creation_id, :change_date, :seg_type, :change_id, :o_tmf
    attr_accessor :src_lang

    # Children.
    attr_accessor :variants

    # Reads every declared attribute from the node; an attribute that is
    # absent from the node yields whatever +tu[...]+ returns for a missing key.
    #
    # @param tu [#[], #children] parsed <tu> node
    def initialize(tu)
      # attrs
      @tuid                  = tu[:tuid]
      @o_encoding            = tu[:'o-encoding']
      @data_type             = tu[:datatype]
      @usage_count           = tu[:usagecount]
      @last_usage_date       = tu[:lastusagedate]
      @creation_tool         = tu[:creationtool]
      @creation_tool_version = tu[:creationtoolversion]
      @creation_date         = tu[:creationdate]
      @creation_id           = tu[:creationid]
      @change_date           = tu[:changedate]
      @seg_type              = tu[:segtype]
      @change_id             = tu[:changeid]
      @o_tmf                 = tu[:'o-tmf']
      @src_lang              = tu[:srclang]

      # children
      @variants = tu.children.select { |c| c.name == 'tuv' }.map { |tuv| TranslationUnitVariant.new tuv }
    end

    # @param element [Object] candidate child
    # @return [Boolean] true for Note, Property and TranslationUnitVariant instances
    def can_contain?(element)
      [Note, Property, TranslationUnitVariant].any? { |c| c === element }
    end

    # Logically, a complete translation-memory database will contain at
    # least two <tuv> elements in each translation unit.
    #
    # @return [Boolean]
    def complete?
      variant_count >= 2
    end

    # FIXME
    # Zero, one or more <note>, or <prop> elements in any order, followed by
    # One or more <tuv> elements.
    # Only the presence of at least one variant is checked.
    #
    # @return [Boolean]
    def valid?
      variant_count >= 1
    end

    private

    # Number of TranslationUnitVariant instances among child_elements.
    # NOTE(review): child_elements is not defined in this class; presumably
    # it is inherited from Element - confirm.
    def variant_count
      child_elements.count { |e| TranslationUnitVariant === e }
    end
  end
end
@@ -0,0 +1,40 @@
1
module Konjak
  # A <tuv> element: its attributes, notes, properties and the segment.
  class TranslationUnitVariant < StructuralElement
    # Accessor name => XML attribute name, in the order they are read.
    XML_ATTRIBUTES = {
      xml_lang: 'xml:lang',
      o_encoding: 'o-encoding',
      data_type: 'datatype',
      usage_count: 'usagecount',
      last_usage_date: 'lastusagedate',
      creation_tool: 'creationtool',
      creation_tool_version: 'creationtoolversion',
      creation_date: 'creationdate',
      creation_id: 'creationid',
      change_date: 'changedate',
      change_id: 'changeid',
      o_tmf: 'o-tmf'
    }.freeze
    private_constant :XML_ATTRIBUTES

    # Required attribute.
    attr_accessor :xml_lang

    # Optional attributes.
    attr_accessor :o_encoding, :data_type, :usage_count, :last_usage_date
    attr_accessor :creation_tool, :creation_tool_version, :creation_date
    attr_accessor :creation_id, :change_date, :change_id, :o_tmf

    # Children.
    attr_accessor :notes, :properties, :segment

    # @param tuv [#[], #children] parsed <tuv> node
    def initialize(tuv)
      XML_ATTRIBUTES.each do |accessor, xml_name|
        instance_variable_set("@#{accessor}", tuv[xml_name])
      end

      @notes      = nodes_named(tuv, 'note').map { |node| Note.new node }
      @properties = nodes_named(tuv, 'prop').map { |node| Property.new node }
      # The segment wraps the first <seg> child (nil when there is none).
      @segment    = Segment.new(tuv.children.detect { |child| child.name == 'seg' })
    end

    # FIXME
    # Zero, one or more <note>, or <prop> elements in any order, followed by
    # One <seg> element.
    # Only the class of +element+ is checked; count and order are not.
    #
    # @param element [Object] candidate child
    # @return [Boolean]
    def can_contain?(element)
      allowed_child_classes.any? { |klass| klass === element }
    end

    private

    # Children of +node+ whose element name equals +tag+.
    def nodes_named(node, tag)
      node.children.select { |child| child.name == tag }
    end

    # Classes accepted by #can_contain?, resolved at call time.
    def allowed_child_classes
      [Note, Property, Segment]
    end
  end
end
@@ -0,0 +1,55 @@
1
+ require 'mem'
2
+
3
module Konjak
  # Replaces source-language segments found in a document with their
  # target-language counterparts taken from a TMX object.
  class Translator
    include Mem

    attr_reader :tmx, :src_lang, :target_lang

    # @param tmx [#body] TMX object whose body exposes translation_units
    # @param src_lang [String] xml:lang value of the source variants
    # @param target_lang [String] xml:lang value of the target variants
    def initialize(tmx, src_lang, target_lang)
      @tmx = tmx
      @src_lang = src_lang
      @target_lang = target_lang
    end

    # Translates +doc+ by substituting every occurrence of each usable
    # source segment with the matching target segment. Units are applied
    # longest source first, and text that has already been substituted is
    # never rewritten by a later unit.
    #
    # @param doc [String] text to translate (not modified)
    # @return [String] the translated text
    def translate(doc)
      fragments = [doc.dup]
      translation_units.each do |tu|
        source = segment_text(tu, src_lang)
        target = segment_text(tu, target_lang)
        # An empty source matches everywhere and String#partition("")
        # never consumes any input, so the replacement loop would not end.
        next if source.empty?

        fragments = fragments.flat_map do |fragment|
          replace_occurrences(fragment, source, target)
        end
      end
      fragments.join
    end

    private

    # String form of the segment text of the variant of +tu+ in +lang+.
    def segment_text(tu, lang)
      tu.variants.detect { |v| v.xml_lang == lang }.segment.text.to_s
    end

    # Splits +fragment+ around each occurrence of +source+ and puts a marked
    # copy of +target+ in its place. Fragments that are already marked as
    # translated, or that do not contain +source+, come back unchanged.
    #
    # @return [Array<String>] the resulting pieces, in order
    def replace_occurrences(fragment, source, target)
      return [fragment] if fragment.respond_to?(:translated)
      return [fragment] unless fragment.include?(source)

      pieces = []
      rest = fragment
      while rest.include?(source)
        head, _match, rest = rest.partition(source)
        pieces << head << mark_translated(target)
      end
      pieces << rest
    end

    # Copy of +text+ carrying a singleton +translated+ method; that method
    # is the marker #replace_occurrences uses to skip substituted text.
    def mark_translated(text)
      text.dup.tap { |copy| copy.define_singleton_method(:translated) { true } }
    end

    # Units usable for this language pair: the unit's srclang is unset,
    # equals src_lang or is '*all*', and variants exist for both languages.
    # Sorted by descending source text length.
    def translation_units
      tmx.body.translation_units.select { |tu|
        (!tu.src_lang || tu.src_lang == src_lang || tu.src_lang == '*all*') &&
          tu.variants.any? { |v| v.xml_lang == src_lang } &&
          tu.variants.any? { |v| v.xml_lang == target_lang }
      }.sort_by { |tu|
        -tu.variants.detect { |v| v.xml_lang == src_lang }.segment.text.length
      }
    end
    memoize :translation_units
  end
end
@@ -0,0 +1,11 @@
1
module Konjak
  # DEPRECATED
  class UnknownTag < InlineElement
    # Optional attribute.
    attr_accessor :x

    # @param element [Object] candidate child
    # @return [Boolean] true when +element+ is a CodeData or a SubFlow
    def can_contain?(element)
      case element
      when CodeData, SubFlow then true
      else false
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
module Konjak
  # A <ude> element: a named encoding with its <map> children.
  class UserDefinedEncoding < StructuralElement
    # Required attribute.
    attr_accessor :name

    # Optional attribute.
    # FIXME: base is required if one or more of the <map/> elements
    # contains a code attribute; that rule is not enforced here.
    attr_accessor :base

    # Children.
    attr_accessor :maps

    # @param ude [#[], #children] parsed <ude> node
    def initialize(ude)
      @name = ude[:name]
      @base = ude[:base]
      map_nodes = ude.children.select { |child| child.name == 'map' }
      @maps = map_nodes.map { |node| Map.new node }
    end

    # @param element [Object] candidate child
    # @return [Boolean] true when +element+ is a Map
    def can_contain?(element)
      case element
      when Map then true
      else false
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Konjak
  # Gem version string; frozen so the constant cannot be mutated in place.
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,92 @@
1
+ <?xml version="1.0" encoding="UTF-8" ?>
2
+ <tmx version="1.4">
3
+ <header creationtool="TMM" creationtoolversion="1.0" segtype="" o-tmf="" adminlang="" srclang="*all*" datatype=""/>
4
+ <body>
5
+ <tu tuid="T:en:ja:false:2040014087321606161:6682938220048076252:ja:13639662800514838096:en:;translation:REVIEWED:e2c10971e4aac54e:" creationtool="TMM" creationtoolversion="1.0" srclang="en">
6
+ <tuv xml:lang="en">
7
+ <seg>This is {0}example{/0}.</seg>
8
+ </tuv>
9
+ <tuv xml:lang="ja">
10
+ <seg>これは、 {0}例{/0} 。</seg>
11
+ </tuv>
12
+ <entry_metadata>
13
+ <tm_entry>
14
+ <source_info>
15
+ <source>This is {0}example{/0}.</source>
16
+ <source_lang>en</source_lang>
17
+ </source_info>
18
+ <translation_info status="REVIEWED">
19
+ <target_lang>ja</target_lang>
20
+ <tm_id>e2c10971e4aac54e</tm_id>
21
+ </translation_info>
22
+ <translation translator_id="560573848676" translation_timestamp="1431727112574000">
23
+ <translation_content>これは、 {0}例{/0} 。</translation_content>
24
+ </translation>
25
+ </tm_entry>
26
+ <context>rO0ABXVyAAJbQqzzF/gGCFTgAgAAeHAAAAAA</context>
27
+ </entry_metadata>
28
+ <translation_content_hash>beb0b69849aeabce9de9288dc8342565</translation_content_hash>
29
+ <entry_metadata_hash>9d8b6da8b9ad7c0d7234019e498b7121</entry_metadata_hash>
30
+ </tu>
31
+ <tu tuid="T:en:ja:false:8812459504862001208:7855750935865835385:ja:1953815444959955934:en:;translation:REVIEWED:e2c10971e4aac54e:" creationtool="TMM" creationtoolversion="1.0" srclang="en">
32
+ <tuv xml:lang="en">
33
+ <seg>
34
+
35
+ &amp;amp; it&amp;#39;s also example.</seg>
36
+ </tuv>
37
+ <tuv xml:lang="ja">
38
+ <seg>
39
+
40
+ &それはまた、例です。</seg>
41
+ </tuv>
42
+ <entry_metadata>
43
+ <tm_entry>
44
+ <source_info>
45
+ <source>
46
+
47
+ &amp;amp; it&amp;#39;s also example.</source>
48
+ <source_lang>en</source_lang>
49
+ </source_info>
50
+ <translation_info status="REVIEWED">
51
+ <target_lang>ja</target_lang>
52
+ <tm_id>e2c10971e4aac54e</tm_id>
53
+ </translation_info>
54
+ <translation translator_id="560573848676" translation_timestamp="1431728356543000">
55
+ <translation_content>
56
+
57
+ &それはまた、例です。</translation_content>
58
+ </translation>
59
+ </tm_entry>
60
+ <context>rO0ABXVyAAJbQqzzF/gGCFTgAgAAeHAAAAASChBUaGlzIGlzIGV4YW1wbGUu</context>
61
+ </entry_metadata>
62
+ <translation_content_hash>6f0afae9a7dfda8c90a75be0d392765e</translation_content_hash>
63
+ <entry_metadata_hash>7e5b6ab67076bbea9790c2031882cea8</entry_metadata_hash>
64
+ </tu>
65
+ <tu tuid="T:en:ja:false:2040014087321606161:6682938220048076252:ja:14373978183534318585:en:;translation:REVIEWED:e2c10971e4aac54e:" creationtool="TMM" creationtoolversion="1.0" srclang="en">
66
+ <tuv xml:lang="en">
67
+ <seg>This is {0}example{/0}.</seg>
68
+ </tuv>
69
+ <tuv xml:lang="ja">
70
+ <seg>これは、 {0}例{/0} 。</seg>
71
+ </tuv>
72
+ <entry_metadata>
73
+ <tm_entry>
74
+ <source_info>
75
+ <source>This is {0}example{/0}.</source>
76
+ <source_lang>en</source_lang>
77
+ </source_info>
78
+ <translation_info status="REVIEWED">
79
+ <target_lang>ja</target_lang>
80
+ <tm_id>e2c10971e4aac54e</tm_id>
81
+ </translation_info>
82
+ <translation translator_id="560573848676" translation_timestamp="1431728291642000">
83
+ <translation_content>これは、 {0}例{/0} 。</translation_content>
84
+ </translation>
85
+ </tm_entry>
86
+ <context>rO0ABXVyAAJbQqzzF/gGCFTgAgAAeHAAAAAYEhYKCiYgaXQncyBhbHNvIGV4YW1wbGUu</context>
87
+ </entry_metadata>
88
+ <translation_content_hash>beb0b69849aeabce9de9288dc8342565</translation_content_hash>
89
+ <entry_metadata_hash>ba7b0252962332aad36a9b14d6184121</entry_metadata_hash>
90
+ </tu>
91
+ </body>
92
+ </tmx>
@@ -0,0 +1,68 @@
1
+ <?xml version="1.0"?>
2
+ <!-- Example of TMX document -->
3
+ <tmx version="1.4">
4
+ <header
5
+ creationtool="XYZTool"
6
+ creationtoolversion="1.01-023"
7
+ datatype="PlainText"
8
+ segtype="sentence"
9
+ adminlang="en-us"
10
+ srclang="EN"
11
+ o-tmf="ABCTransMem"
12
+ creationdate="20020101T163812Z"
13
+ creationid="ThomasJ"
14
+ changedate="20020413T023401Z"
15
+ changeid="Amity"
16
+ o-encoding="iso-8859-1"
17
+ >
18
+ <note xml:lang="en" o-encoding="iso-8859-1">This is a note at document level.</note>
19
+ <prop xml:lang="en" o-encoding="iso-8859-1" type="RTFPreamble">{\rtf1\ansi\tag etc...{\fonttbl}</prop>
20
+ <ude name="MacRoman" base="Macintosh">
21
+ <map unicode="#xF8FF" code="#xF0" ent="Apple_logo" subst="[Apple]"/>
22
+ </ude>
23
+ </header>
24
+ <body>
25
+ <tu
26
+ tuid="0001"
27
+ datatype="Text"
28
+ usagecount="2"
29
+ lastusagedate="19970314T023401Z"
30
+ >
31
+ <note>Text of a note at the TU level.</note>
32
+ <prop type="x-Domain">Computing</prop>
33
+ <prop type="x-Project">P&#x00E6;gasus</prop>
34
+ <tuv
35
+ xml:lang="EN"
36
+ creationdate="19970212T153400Z"
37
+ creationid="BobW"
38
+ >
39
+ <seg>data (with a non-standard character: &#xF8FF;).</seg>
40
+ </tuv>
41
+ <tuv
42
+ xml:lang="FR-CA"
43
+ creationdate="19970309T021145Z"
44
+ creationid="BobW"
45
+ changedate="19970314T023401Z"
46
+ changeid="ManonD"
47
+ >
48
+ <prop type="Origin">MT</prop>
49
+ <seg>donn&#xE9;es (avec un caract&#xE8;re non standard: &#xF8FF;).</seg>
50
+ </tuv>
51
+ </tu>
52
+ <tu
53
+ tuid="0002"
54
+ srclang="*all*"
55
+ >
56
+ <prop type="Domain">Cooking</prop>
57
+ <tuv xml:lang="EN">
58
+ <seg>menu</seg>
59
+ </tuv>
60
+ <tuv xml:lang="FR-CA">
61
+ <seg>menu</seg>
62
+ </tuv>
63
+ <tuv xml:lang="FR-FR">
64
+ <seg>menu</seg>
65
+ </tuv>
66
+ </tu>
67
+ </body>
68
+ </tmx>
@@ -0,0 +1,179 @@
1
+ require 'spec_helper'
2
+
3
# Parses the sample TMX fixture and checks the resulting object tree, then
# checks that the gtt option changes how the gtt fixture is read.
describe Konjak do
  let(:sample_tmx) { File.read('spec/fixtures/sample.tmx') }

  subject { Konjak.parse(sample_tmx) }

  it { is_expected.to be_kind_of Konjak::Tmx }

  its(:version) { is_expected.to eq '1.4' }

  describe 'header' do
    subject { super().header }

    it { is_expected.to be_instance_of Konjak::Header }

    its(:creation_tool) { is_expected.to eq 'XYZTool' }
    its(:creation_tool_version) { is_expected.to eq '1.01-023' }
    its(:data_type) { is_expected.to eq 'PlainText' }
    its(:seg_type) { is_expected.to eq 'sentence' }
    its(:admin_lang) { is_expected.to eq 'en-us' }
    its(:src_lang) { is_expected.to eq 'EN' }
    its(:o_tmf) { is_expected.to eq 'ABCTransMem' }
    its(:creation_date) { is_expected.to eq '20020101T163812Z' }
    its(:creation_id) { is_expected.to eq 'ThomasJ' }
    its(:change_date) { is_expected.to eq '20020413T023401Z' }
    its(:change_id) { is_expected.to eq 'Amity' }
    its(:o_encoding) { is_expected.to eq 'iso-8859-1' }

    describe 'notes' do
      subject { super().notes }

      its(:size) { is_expected.to eq 1 }
      it { is_expected.to be_all { |note| note.instance_of? Konjak::Note } }

      describe '.first' do
        subject { super().first }

        its(:xml_lang) { is_expected.to eq 'en' }
        its(:o_encoding) { is_expected.to eq 'iso-8859-1' }
        its(:text) { is_expected.to be_instance_of Konjak::Text }

        describe 'text' do
          subject { super().text }

          its(:to_s) { is_expected.to eq 'This is a note at document level.' }
        end
      end
    end

    describe 'user_defined_encodings' do
      subject { super().user_defined_encodings }

      its(:size) { is_expected.to eq 1 }
      it { is_expected.to be_all { |ude| ude.instance_of? Konjak::UserDefinedEncoding } }

      describe '.first' do
        subject { super().first }

        its(:name) { is_expected.to eq 'MacRoman' }
        its(:base) { is_expected.to eq 'Macintosh' }

        describe '.map' do
          subject { super().maps }

          its(:size) { is_expected.to eq 1 }
          it { is_expected.to be_all { |map| map.instance_of? Konjak::Map } }

          describe '.first' do
            subject { super().first }

            its(:unicode) { is_expected.to eq '#xF8FF' }
            its(:code) { is_expected.to eq '#xF0' }
            its(:entity) { is_expected.to eq 'Apple_logo' }
            its(:substitution) { is_expected.to eq '[Apple]' }
          end
        end
      end
    end

    describe 'properties' do
      subject { super().properties }

      its(:size) { is_expected.to eq 1 }
      it { is_expected.to be_all { |prop| prop.instance_of? Konjak::Property } }

      describe '.first' do
        subject { super().first }

        its(:xml_lang) { is_expected.to eq 'en' }
        its(:o_encoding) { is_expected.to eq 'iso-8859-1' }
        its(:type) { is_expected.to eq 'RTFPreamble' }
        its(:text) { is_expected.to be_instance_of Konjak::Text }

        describe '.text' do
          subject { super().text }

          its(:to_s) { is_expected.to eq '{\rtf1\ansi\tag etc...{\fonttbl}' }
        end
      end
    end
  end

  describe 'body' do
    subject { super().body }

    it { is_expected.to be_instance_of Konjak::Body }

    describe 'translation_units' do
      subject { super().translation_units }

      its(:size) { is_expected.to eq 2 }
      it { is_expected.to be_all { |unit| unit.instance_of? Konjak::TranslationUnit } }

      describe 'translation unit 0001' do
        subject { super().find { |unit| unit.tuid == '0001' } }

        its(:tuid) { is_expected.to eq '0001' }
        its(:data_type) { is_expected.to eq 'Text' }
        its(:usage_count) { is_expected.to eq '2' }
        its(:last_usage_date) { is_expected.to eq '19970314T023401Z' }

        its('variants.size') { is_expected.to eq 2 }
        its(:variants) { is_expected.to be_all { |variant| variant.instance_of? Konjak::TranslationUnitVariant } }

        describe '.variants.last' do
          subject { super().variants.last }

          its(:xml_lang) { is_expected.to eq 'FR-CA' }
          its(:creation_date) { is_expected.to eq '19970309T021145Z' }
          its(:creation_id) { is_expected.to eq 'BobW' }
          its(:change_date) { is_expected.to eq '19970314T023401Z' }
          its(:change_id) { is_expected.to eq 'ManonD' }

          its(:notes) { is_expected.to be_all { |note| note.instance_of? Konjak::Note } }
          its(:notes) { is_expected.to be_empty }
          its(:properties) { is_expected.to be_all { |prop| prop.instance_of? Konjak::Property } }
          its('properties.size') { is_expected.to eq 1 }
          its(:segment) { is_expected.to be_instance_of Konjak::Segment }

          describe '.segment' do
            subject { super().segment }

            its(:text) { is_expected.to be_instance_of Konjak::Text }

            describe '.text' do
              subject { super().text }

              its(:to_s) { is_expected.to eq "donn\u00E9es (avec un caract\u00E8re non standard: \uF8FF)." }
            end
          end
        end
      end

      describe 'translation unit 0002' do
        subject { super().find { |unit| unit.tuid == '0002' } }

        its(:src_lang) { is_expected.to eq '*all*' }
      end
    end
  end

  describe 'gtt' do
    let(:xml) { File.read('spec/fixtures/gtt.tmx') }

    # The first translation unit with a variant whose segment text equals
    # the decoded string, or nil when no unit has one.
    subject do
      tmx.body.translation_units.find do |unit|
        unit.variants.any? { |variant| variant.segment.text.to_s == "\n\n& it's also example." }
      end
    end

    context 'gtt: true' do
      let(:tmx) { Konjak.parse(xml, gtt: true) }

      it { is_expected.to be_truthy }
    end

    context 'gtt: false' do
      let(:tmx) { Konjak.parse(xml, gtt: false) }

      it { is_expected.to be_falsey }
    end
  end
end