konjak 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
module Konjak
  # Marker superclass for container elements. It adds no behaviour of its
  # own on top of Element.
  class StructuralElement < Element; end
end
@@ -0,0 +1,15 @@
1
module Konjak
  # Inline element whose children are limited to text and a fixed set of
  # inline tag classes (see #can_contain?).
  class SubFlow < InlineElement
    # optional attrs
    attr_accessor :data_type, :type

    # FIXME
    # Allowed content is text data plus zero, one or more <bpt>, <ept>, <it>,
    # <ph> and <hi> elements in any order, except that every <bpt> must be
    # followed by its corresponding <ept>. Only class membership is checked
    # below; the <bpt>/<ept> pairing rule is not enforced.

    # Reports whether +element+ is an instance of one of the permitted classes.
    def can_contain?(element)
      permitted = [Text, BeginPairedTag, EndPairedTag, IsolatedTag, Placeholder, Hilight]

      case element
      when *permitted then true
      else false
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
module Konjak
  # Thin wrapper around the character content of an element.
  class Text
    # @param text [#to_s] raw content. It is coerced to a String once here so
    #   that #to_s always honours Ruby's "returns a String" contract and
    #   #length never raises for nil input (nil becomes "").
    def initialize(text)
      @text = text.to_s
    end

    # @return [String] the wrapped content
    def to_s
      @text
    end

    # @return [Integer] number of characters in the wrapped content
    def length
      @text.length
    end
  end
end
data/lib/konjak/tmx.rb ADDED
@@ -0,0 +1,27 @@
1
module Konjak
  # Root element of a document: exposes the version attribute and wraps the
  # <header> and <body> child nodes on demand.
  class Tmx < StructuralElement
    # required attrs
    attr_accessor :version

    # @param tmx parsed root node; must answer #[] for attributes and
    #   #children for child nodes (each child answering #name)
    def initialize(tmx)
      @version = tmx[:version]
      # TODO - better error handling
      # Select the children by tag name instead of by position, so that
      # whitespace, comment or unexpected nodes around <header>/<body> cannot
      # shift the wrong node into either slot.
      children = tmx.children
      @header = children.detect { |c| c.name == 'header' }
      @body = children.detect { |c| c.name == 'body' }
    end

    # Builds a fresh Header wrapper around the <header> node on every call.
    def header
      Header.new @header
    end

    # Builds a fresh Body wrapper around the <body> node on every call.
    def body
      Body.new @body
    end

    # FIXME
    # One <header> followed by
    # One <body> element.
    # Only the class of +element+ is checked; order and cardinality are not.
    def can_contain?(element)
      Header === element || Body === element
    end
  end
end
@@ -0,0 +1,40 @@
1
module Konjak
  # A <tu> element: a set of attributes plus one TranslationUnitVariant per
  # <tuv> child.
  class TranslationUnit < StructuralElement
    # optional attrs
    attr_accessor :tuid, :o_encoding, :data_type, :usage_count, :last_usage_date
    attr_accessor :creation_tool, :creation_tool_version, :creation_date
    attr_accessor :creation_id, :change_date, :seg_type, :change_id, :o_tmf
    attr_accessor :src_lang

    # children
    attr_accessor :variants

    # @param tu parsed <tu> node; must answer #[] for attributes and
    #   #children for child nodes
    def initialize(tu)
      # attrs -- every declared accessor is loaded from the node, so that
      # attributes present on <tu> (e.g. creationtool) are not silently nil.
      @tuid = tu['tuid']
      @o_encoding = tu['o-encoding']
      @data_type = tu['datatype']
      @usage_count = tu['usagecount']
      @last_usage_date = tu['lastusagedate']
      @creation_tool = tu['creationtool']
      @creation_tool_version = tu['creationtoolversion']
      @creation_date = tu['creationdate']
      @creation_id = tu['creationid']
      @change_date = tu['changedate']
      @seg_type = tu['segtype']
      @change_id = tu['changeid']
      @o_tmf = tu['o-tmf']
      @src_lang = tu['srclang']

      # children
      @variants = tu.children.select { |c| c.name == 'tuv' }.map { |tuv| TranslationUnitVariant.new tuv }
    end

    # Reports whether +element+ is a Note, Property or TranslationUnitVariant.
    def can_contain?(element)
      [Note, Property, TranslationUnitVariant].any? { |c| c === element }
    end

    # Logically, a complete translation-memory database will contain at least two <tuv> elements in each translation unit.
    # NOTE(review): relies on #child_elements, which is not defined in this
    # class (presumably inherited) -- confirm it reflects the parsed <tuv>s.
    def complete?
      child_elements.count { |e| TranslationUnitVariant === e } >= 2
    end

    # FIXME
    # Zero, one or more <note>, or <prop> elements in any order, followed by
    # One or more <tuv> elements.
    # Only the "at least one <tuv>" part is checked here.
    def valid?
      child_elements.count { |e| TranslationUnitVariant === e } >= 1
    end
  end
end
@@ -0,0 +1,40 @@
1
module Konjak
  # A <tuv> element: its attributes, its <note> and <prop> children and its
  # single <seg>.
  class TranslationUnitVariant < StructuralElement
    # required attrs
    attr_accessor :xml_lang

    # optional attrs
    attr_accessor :o_encoding, :data_type, :usage_count, :last_usage_date
    attr_accessor :creation_tool, :creation_tool_version, :creation_date
    attr_accessor :creation_id, :change_date, :change_id, :o_tmf

    # children
    attr_accessor :notes, :properties, :segment

    # Reads every attribute from +tuv+ and wraps its note, prop and seg
    # children in Note, Property and Segment objects.
    def initialize(tuv)
      # instance variable name => attribute name on the node, in load order
      attribute_names = {
        xml_lang: 'xml:lang',
        o_encoding: 'o-encoding',
        data_type: 'datatype',
        usage_count: 'usagecount',
        last_usage_date: 'lastusagedate',
        creation_tool: 'creationtool',
        creation_tool_version: 'creationtoolversion',
        creation_date: 'creationdate',
        creation_id: 'creationid',
        change_date: 'changedate',
        change_id: 'changeid',
        o_tmf: 'o-tmf'
      }
      attribute_names.each do |ivar, xml_name|
        instance_variable_set("@#{ivar}", tuv[xml_name])
      end

      children_named = ->(tag) { tuv.children.select { |child| child.name == tag } }

      @notes = children_named.call('note').map { |node| Note.new node }
      @properties = children_named.call('prop').map { |node| Property.new node }

      seg_node = tuv.children.detect { |child| child.name == 'seg' }
      @segment = Segment.new(seg_node)
    end

    # FIXME
    # Zero, one or more <note>, or <prop> elements in any order, followed by
    # One <seg> element.
    # Only the class of +element+ is checked.
    def can_contain?(element)
      permitted = [Note, Property, Segment]

      case element
      when *permitted then true
      else false
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
+ require 'mem'
2
+
3
module Konjak
  # Translates text by replacing every occurrence of a source-language
  # segment taken from a TMX with the matching target-language segment.
  class Translator
    include Mem

    attr_reader :tmx, :src_lang, :target_lang

    # @param tmx object answering +body.translation_units+
    # @param src_lang [String] xml:lang value of the variants to match
    # @param target_lang [String] xml:lang value of the variants to emit
    def initialize(tmx, src_lang, target_lang)
      @tmx = tmx
      @src_lang = src_lang
      @target_lang = target_lang
    end

    # Replaces every occurrence of each known source segment in +doc+.
    #
    # Text that has already been produced by a replacement is never scanned
    # again, so a target segment cannot be re-translated by a later unit.
    #
    # @param doc [String] text to translate; left unmodified
    # @return [String] the translated text
    def translate(doc)
      # Each fragment is a [text, translated] pair; the flag replaces the
      # singleton-method marker that used to be attached to target strings.
      fragments = [[doc.dup, false]]

      translation_units.each do |tu|
        source = segment_text(tu, src_lang)
        # An empty source matches at offset 0 without consuming anything, so
        # splitting on it would never terminate. Such units are skipped.
        next if source.empty?

        target = segment_text(tu, target_lang)
        fragments = fragments.flat_map do |text, translated|
          next [[text, translated]] if translated || !text.include?(source)

          substitute(text, source, target)
        end
      end

      fragments.map(&:first).join
    end

    private

    # Text of the variant of +tu+ whose xml:lang equals +lang+.
    def segment_text(tu, lang)
      tu.variants.detect { |v| v.xml_lang == lang }.segment.text.to_s
    end

    # Splits +text+ around every occurrence of +source+ (which must be
    # non-empty) and returns the [text, translated] pairs, with +target+
    # standing in for each occurrence.
    def substitute(text, source, target)
      pieces = []
      rest = text
      while rest.include?(source)
        head, _match, rest = rest.partition(source)
        pieces << [head, false] << [target, true]
      end
      pieces << [rest, false]
    end

    # Units usable for this language pair: the unit's srclang is absent,
    # equal to src_lang or '*all*', and it has a variant for both languages.
    # Sorted by descending source-text length so that longer segments are
    # substituted before shorter ones.
    def translation_units
      tmx.body.translation_units.select { |tu|
        (!tu.src_lang || tu.src_lang == src_lang || tu.src_lang == '*all*') &&
          tu.variants.any? { |v| v.xml_lang == src_lang } &&
          tu.variants.any? { |v| v.xml_lang == target_lang }
      }.sort_by { |tu|
        -tu.variants.detect { |v| v.xml_lang == src_lang }.segment.text.length
      }
    end
    memoize :translation_units
  end
end
@@ -0,0 +1,11 @@
1
module Konjak
  # DEPRECATED
  # Inline element that accepts CodeData and SubFlow children.
  class UnknownTag < InlineElement
    # optional attrs
    attr_accessor :x

    # Reports whether +element+ is a CodeData or a SubFlow.
    def can_contain?(element)
      case element
      when CodeData, SubFlow then true
      else false
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
module Konjak
  # A <ude> element: a named encoding with an optional base and one Map per
  # <map> child.
  class UserDefinedEncoding < StructuralElement
    # required attrs
    attr_accessor :name

    # FIXME
    # base (required if one or more of the <map/> elements contains a code attribute).
    # That requirement is not validated here.
    # optional attrs
    attr_accessor :base

    # children
    attr_accessor :maps

    # Reads name and base from +ude+ and wraps each <map> child in a Map.
    def initialize(ude)
      @name, @base = ude[:name], ude[:base]

      map_nodes = ude.children.select { |child| child.name == 'map' }
      @maps = map_nodes.map { |node| Map.new node }
    end

    # Reports whether +element+ is a Map.
    def can_contain?(element)
      case element
      when Map then true
      else false
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Konjak
  # Gem version. Frozen so the shared constant cannot be mutated in place.
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,92 @@
1
+ <?xml version="1.0" encoding="UTF-8" ?>
2
+ <tmx version="1.4">
3
+ <header creationtool="TMM" creationtoolversion="1.0" segtype="" o-tmf="" adminlang="" srclang="*all*" datatype=""/>
4
+ <body>
5
+ <tu tuid="T:en:ja:false:2040014087321606161:6682938220048076252:ja:13639662800514838096:en:;translation:REVIEWED:e2c10971e4aac54e:" creationtool="TMM" creationtoolversion="1.0" srclang="en">
6
+ <tuv xml:lang="en">
7
+ <seg>This is {0}example{/0}.</seg>
8
+ </tuv>
9
+ <tuv xml:lang="ja">
10
+ <seg>これは、 {0}例{/0} 。</seg>
11
+ </tuv>
12
+ <entry_metadata>
13
+ <tm_entry>
14
+ <source_info>
15
+ <source>This is {0}example{/0}.</source>
16
+ <source_lang>en</source_lang>
17
+ </source_info>
18
+ <translation_info status="REVIEWED">
19
+ <target_lang>ja</target_lang>
20
+ <tm_id>e2c10971e4aac54e</tm_id>
21
+ </translation_info>
22
+ <translation translator_id="560573848676" translation_timestamp="1431727112574000">
23
+ <translation_content>これは、 {0}例{/0} 。</translation_content>
24
+ </translation>
25
+ </tm_entry>
26
+ <context>rO0ABXVyAAJbQqzzF/gGCFTgAgAAeHAAAAAA</context>
27
+ </entry_metadata>
28
+ <translation_content_hash>beb0b69849aeabce9de9288dc8342565</translation_content_hash>
29
+ <entry_metadata_hash>9d8b6da8b9ad7c0d7234019e498b7121</entry_metadata_hash>
30
+ </tu>
31
+ <tu tuid="T:en:ja:false:8812459504862001208:7855750935865835385:ja:1953815444959955934:en:;translation:REVIEWED:e2c10971e4aac54e:" creationtool="TMM" creationtoolversion="1.0" srclang="en">
32
+ <tuv xml:lang="en">
33
+ <seg>
34
+
35
+ &amp;amp; it&amp;#39;s also example.</seg>
36
+ </tuv>
37
+ <tuv xml:lang="ja">
38
+ <seg>
39
+
40
+ &それはまた、例です。</seg>
41
+ </tuv>
42
+ <entry_metadata>
43
+ <tm_entry>
44
+ <source_info>
45
+ <source>
46
+
47
+ &amp;amp; it&amp;#39;s also example.</source>
48
+ <source_lang>en</source_lang>
49
+ </source_info>
50
+ <translation_info status="REVIEWED">
51
+ <target_lang>ja</target_lang>
52
+ <tm_id>e2c10971e4aac54e</tm_id>
53
+ </translation_info>
54
+ <translation translator_id="560573848676" translation_timestamp="1431728356543000">
55
+ <translation_content>
56
+
57
+ &それはまた、例です。</translation_content>
58
+ </translation>
59
+ </tm_entry>
60
+ <context>rO0ABXVyAAJbQqzzF/gGCFTgAgAAeHAAAAASChBUaGlzIGlzIGV4YW1wbGUu</context>
61
+ </entry_metadata>
62
+ <translation_content_hash>6f0afae9a7dfda8c90a75be0d392765e</translation_content_hash>
63
+ <entry_metadata_hash>7e5b6ab67076bbea9790c2031882cea8</entry_metadata_hash>
64
+ </tu>
65
+ <tu tuid="T:en:ja:false:2040014087321606161:6682938220048076252:ja:14373978183534318585:en:;translation:REVIEWED:e2c10971e4aac54e:" creationtool="TMM" creationtoolversion="1.0" srclang="en">
66
+ <tuv xml:lang="en">
67
+ <seg>This is {0}example{/0}.</seg>
68
+ </tuv>
69
+ <tuv xml:lang="ja">
70
+ <seg>これは、 {0}例{/0} 。</seg>
71
+ </tuv>
72
+ <entry_metadata>
73
+ <tm_entry>
74
+ <source_info>
75
+ <source>This is {0}example{/0}.</source>
76
+ <source_lang>en</source_lang>
77
+ </source_info>
78
+ <translation_info status="REVIEWED">
79
+ <target_lang>ja</target_lang>
80
+ <tm_id>e2c10971e4aac54e</tm_id>
81
+ </translation_info>
82
+ <translation translator_id="560573848676" translation_timestamp="1431728291642000">
83
+ <translation_content>これは、 {0}例{/0} 。</translation_content>
84
+ </translation>
85
+ </tm_entry>
86
+ <context>rO0ABXVyAAJbQqzzF/gGCFTgAgAAeHAAAAAYEhYKCiYgaXQncyBhbHNvIGV4YW1wbGUu</context>
87
+ </entry_metadata>
88
+ <translation_content_hash>beb0b69849aeabce9de9288dc8342565</translation_content_hash>
89
+ <entry_metadata_hash>ba7b0252962332aad36a9b14d6184121</entry_metadata_hash>
90
+ </tu>
91
+ </body>
92
+ </tmx>
@@ -0,0 +1,68 @@
1
+ <?xml version="1.0"?>
2
+ <!-- Example of TMX document -->
3
+ <tmx version="1.4">
4
+ <header
5
+ creationtool="XYZTool"
6
+ creationtoolversion="1.01-023"
7
+ datatype="PlainText"
8
+ segtype="sentence"
9
+ adminlang="en-us"
10
+ srclang="EN"
11
+ o-tmf="ABCTransMem"
12
+ creationdate="20020101T163812Z"
13
+ creationid="ThomasJ"
14
+ changedate="20020413T023401Z"
15
+ changeid="Amity"
16
+ o-encoding="iso-8859-1"
17
+ >
18
+ <note xml:lang="en" o-encoding="iso-8859-1">This is a note at document level.</note>
19
+ <prop xml:lang="en" o-encoding="iso-8859-1" type="RTFPreamble">{\rtf1\ansi\tag etc...{\fonttbl}</prop>
20
+ <ude name="MacRoman" base="Macintosh">
21
+ <map unicode="#xF8FF" code="#xF0" ent="Apple_logo" subst="[Apple]"/>
22
+ </ude>
23
+ </header>
24
+ <body>
25
+ <tu
26
+ tuid="0001"
27
+ datatype="Text"
28
+ usagecount="2"
29
+ lastusagedate="19970314T023401Z"
30
+ >
31
+ <note>Text of a note at the TU level.</note>
32
+ <prop type="x-Domain">Computing</prop>
33
+ <prop type="x-Project">P&#x00E6;gasus</prop>
34
+ <tuv
35
+ xml:lang="EN"
36
+ creationdate="19970212T153400Z"
37
+ creationid="BobW"
38
+ >
39
+ <seg>data (with a non-standard character: &#xF8FF;).</seg>
40
+ </tuv>
41
+ <tuv
42
+ xml:lang="FR-CA"
43
+ creationdate="19970309T021145Z"
44
+ creationid="BobW"
45
+ changedate="19970314T023401Z"
46
+ changeid="ManonD"
47
+ >
48
+ <prop type="Origin">MT</prop>
49
+ <seg>donn&#xE9;es (avec un caract&#xE8;re non standard: &#xF8FF;).</seg>
50
+ </tuv>
51
+ </tu>
52
+ <tu
53
+ tuid="0002"
54
+ srclang="*all*"
55
+ >
56
+ <prop type="Domain">Cooking</prop>
57
+ <tuv xml:lang="EN">
58
+ <seg>menu</seg>
59
+ </tuv>
60
+ <tuv xml:lang="FR-CA">
61
+ <seg>menu</seg>
62
+ </tuv>
63
+ <tuv xml:lang="FR-FR">
64
+ <seg>menu</seg>
65
+ </tuv>
66
+ </tu>
67
+ </body>
68
+ </tmx>
@@ -0,0 +1,179 @@
1
+ require 'spec_helper'
2
+
3
# Parses the sample TMX fixture and checks the resulting object graph
# (header, body, translation units), then checks the gtt: parse option
# against the gtt fixture.
describe Konjak do
  let(:sample_tmx) { File.read('spec/fixtures/sample.tmx') }

  subject { Konjak.parse(sample_tmx) }

  it { is_expected.to be_kind_of Konjak::Tmx }

  its(:version) { is_expected.to eq '1.4' }

  describe 'header' do
    subject { super().header }

    it { is_expected.to be_instance_of Konjak::Header }

    # attribute => value expected from sample.tmx
    header_attributes = {
      creation_tool: 'XYZTool',
      creation_tool_version: '1.01-023',
      data_type: 'PlainText',
      seg_type: 'sentence',
      admin_lang: 'en-us',
      src_lang: 'EN',
      o_tmf: 'ABCTransMem',
      creation_date: '20020101T163812Z',
      creation_id: 'ThomasJ',
      change_date: '20020413T023401Z',
      change_id: 'Amity',
      o_encoding: 'iso-8859-1'
    }
    header_attributes.each do |attribute, value|
      its(attribute) { is_expected.to eq value }
    end

    describe 'notes' do
      subject { super().notes }

      its(:size) { is_expected.to eq 1 }
      it { is_expected.to be_all { |note| note.instance_of? Konjak::Note } }

      describe '.first' do
        subject { super().first }

        note_attributes = { xml_lang: 'en', o_encoding: 'iso-8859-1' }
        note_attributes.each do |attribute, value|
          its(attribute) { is_expected.to eq value }
        end
        its(:text) { is_expected.to be_instance_of Konjak::Text }

        describe 'text' do
          subject { super().text }

          its(:to_s) { is_expected.to eq 'This is a note at document level.' }
        end
      end
    end

    describe 'user_defined_encodings' do
      subject { super().user_defined_encodings }

      its(:size) { is_expected.to eq 1 }
      it { is_expected.to be_all { |ude| ude.instance_of? Konjak::UserDefinedEncoding } }

      describe '.first' do
        subject { super().first }

        ude_attributes = { name: 'MacRoman', base: 'Macintosh' }
        ude_attributes.each do |attribute, value|
          its(attribute) { is_expected.to eq value }
        end

        describe '.map' do
          subject { super().maps }

          its(:size) { is_expected.to eq 1 }
          it { is_expected.to be_all { |map| map.instance_of? Konjak::Map } }

          describe '.first' do
            subject { super().first }

            map_attributes = {
              unicode: '#xF8FF',
              code: '#xF0',
              entity: 'Apple_logo',
              substitution: '[Apple]'
            }
            map_attributes.each do |attribute, value|
              its(attribute) { is_expected.to eq value }
            end
          end
        end
      end
    end

    describe 'properties' do
      subject { super().properties }

      its(:size) { is_expected.to eq 1 }
      it { is_expected.to be_all { |prop| prop.instance_of? Konjak::Property } }

      describe '.first' do
        subject { super().first }

        property_attributes = {
          xml_lang: 'en',
          o_encoding: 'iso-8859-1',
          type: 'RTFPreamble'
        }
        property_attributes.each do |attribute, value|
          its(attribute) { is_expected.to eq value }
        end
        its(:text) { is_expected.to be_instance_of Konjak::Text }

        describe '.text' do
          subject { super().text }

          its(:to_s) { is_expected.to eq '{\rtf1\ansi\tag etc...{\fonttbl}' }
        end
      end
    end
  end

  describe 'body' do
    subject { super().body }

    it { is_expected.to be_instance_of Konjak::Body }

    describe 'translation_units' do
      subject { super().translation_units }

      its(:size) { is_expected.to eq 2 }
      it { is_expected.to be_all { |tu| tu.instance_of? Konjak::TranslationUnit } }

      describe 'translation unit 0001' do
        subject { super().detect { |tu| tu.tuid == '0001' } }

        unit_attributes = {
          tuid: '0001',
          data_type: 'Text',
          usage_count: '2',
          last_usage_date: '19970314T023401Z'
        }
        unit_attributes.each do |attribute, value|
          its(attribute) { is_expected.to eq value }
        end

        its('variants.size') { is_expected.to eq 2 }
        its(:variants) { is_expected.to be_all { |tuv| tuv.instance_of? Konjak::TranslationUnitVariant } }

        describe '.variants.last' do
          subject { super().variants.last }

          variant_attributes = {
            xml_lang: 'FR-CA',
            creation_date: '19970309T021145Z',
            creation_id: 'BobW',
            change_date: '19970314T023401Z',
            change_id: 'ManonD'
          }
          variant_attributes.each do |attribute, value|
            its(attribute) { is_expected.to eq value }
          end

          its(:notes) { is_expected.to be_all { |note| note.instance_of? Konjak::Note } }
          its(:notes) { is_expected.to be_empty }
          its(:properties) { is_expected.to be_all { |prop| prop.instance_of? Konjak::Property } }
          its('properties.size') { is_expected.to eq 1 }
          its(:segment) { is_expected.to be_instance_of Konjak::Segment }

          describe '.segment' do
            subject { super().segment }

            its(:text) { is_expected.to be_instance_of Konjak::Text }

            describe '.text' do
              subject { super().text }

              its(:to_s) { is_expected.to eq "donn\u00E9es (avec un caract\u00E8re non standard: \uF8FF)." }
            end
          end
        end
      end

      describe 'translation unit 0002' do
        subject { super().detect { |tu| tu.tuid == '0002' } }

        its(:src_lang) { is_expected.to eq '*all*' }
      end
    end
  end

  describe 'gtt' do
    let(:xml) { File.read('spec/fixtures/gtt.tmx') }

    # The unit whose segment text equals the unescaped form of the fixture's
    # doubly escaped segment, or nil when no such unit exists.
    subject do
      tmx.body.translation_units.detect do |tu|
        tu.variants.detect { |v| v.segment.text.to_s == "\n\n& it's also example." }
      end
    end

    # gtt flag => matcher the lookup above is expected to satisfy
    gtt_expectations = { true => :be_truthy, false => :be_falsey }
    gtt_expectations.each do |flag, matcher|
      context "gtt: #{flag}" do
        let(:tmx) { Konjak.parse(xml, gtt: flag) }

        it { is_expected.to send(matcher) }
      end
    end
  end
end