konjak 0.0.14 → 0.0.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/konjak/body.rb +3 -1
- data/lib/konjak/element.rb +3 -0
- data/lib/konjak/header.rb +5 -3
- data/lib/konjak/highlight.rb +1 -1
- data/lib/konjak/html_segmentor.rb +8 -7
- data/lib/konjak/map.rb +2 -0
- data/lib/konjak/note.rb +4 -2
- data/lib/konjak/polytex_segmentor.rb +23 -20
- data/lib/konjak/property.rb +4 -2
- data/lib/konjak/segment/gtt.rb +0 -1
- data/lib/konjak/segment.rb +8 -5
- data/lib/konjak/sub_flow.rb +1 -1
- data/lib/konjak/tmx.rb +2 -2
- data/lib/konjak/tmx_segmentor/strategy.rb +9 -5
- data/lib/konjak/translation_unit.rb +4 -1
- data/lib/konjak/translation_unit_variant.rb +6 -3
- data/lib/konjak/user_defined_encoding.rb +3 -1
- data/lib/konjak/version.rb +1 -1
- data/lib/konjak.rb +0 -1
- data/spec/konjak_parse_spec.rb +3 -3
- metadata +2 -4
- data/lib/konjak/text.rb +0 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 219a842656b4743ac6f782fdca2167dcf85e51ab
|
4
|
+
data.tar.gz: 2a1c74e8771791b5ac9dab000a7d556f88a14335
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f26891f463c259af8eee10698208a6df50c92b49c5b0f0a5065feadcccd10fe45733b8598eec5e47d35286feb519156aa8b82600576eeccd2d6cf39ecaf29daf
|
7
|
+
data.tar.gz: c88fe8b6fdbab45f4f5ddcc8c851869e47b62fc7bfc800676dba0f63352e193ffb453b15f95244eb364398b5c841629f2ce2d9df5b9edeca2a7b23e7c5c8caad
|
data/lib/konjak/body.rb
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
module Konjak
|
2
2
|
class Body < StructuralElement
|
3
|
+
TAG_NAME = 'body'
|
4
|
+
|
3
5
|
# childrens
|
4
6
|
def translation_units
|
5
|
-
children.select {|c| c.name ==
|
7
|
+
children.select {|c| c.name == TranslationUnit::TAG_NAME }.map! {|tu| TranslationUnit.new(tu) }
|
6
8
|
end
|
7
9
|
|
8
10
|
# methods
|
data/lib/konjak/element.rb
CHANGED
data/lib/konjak/header.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
module Konjak
|
2
2
|
class Header < StructuralElement
|
3
|
+
TAG_NAME = 'header'
|
4
|
+
|
3
5
|
# required attrs
|
4
6
|
tmx_attr_accessor(:creation_tool, :creationtool, required: true)
|
5
7
|
tmx_attr_accessor(:creation_tool_version, :creationtoolversion, required: true)
|
@@ -18,15 +20,15 @@ module Konjak
|
|
18
20
|
|
19
21
|
# childrens
|
20
22
|
def notes
|
21
|
-
children.select {|c| c.name ==
|
23
|
+
children.select {|c| c.name == Note::TAG_NAME }.map! {|n| Note.new(n) }
|
22
24
|
end
|
23
25
|
|
24
26
|
def user_defined_encodings
|
25
|
-
children.select {|c| c.name ==
|
27
|
+
children.select {|c| c.name == UserDefinedEncoding::TAG_NAME }.map! {|n| UserDefinedEncoding.new(n) }
|
26
28
|
end
|
27
29
|
|
28
30
|
def properties
|
29
|
-
children.select {|c| c.name ==
|
31
|
+
children.select {|c| c.name == Property::TAG_NAME }.map! {|n| Property.new(n) }
|
30
32
|
end
|
31
33
|
|
32
34
|
# methods
|
data/lib/konjak/highlight.rb
CHANGED
@@ -11,7 +11,7 @@ module Konjak
|
|
11
11
|
# Zero, one or more of the following elements: <bpt>, <ept>, <it>, <ph>, and <hi>.
|
12
12
|
# They can be in any order, except that each <bpt> element must have a subsequent corresponding <ept> element.
|
13
13
|
def can_contain?(element)
|
14
|
-
[
|
14
|
+
[String, BeginPairedTag, EndPairedTag, IsolatedTag, Placeholder, Hilight].any? {|c| c === element }
|
15
15
|
end
|
16
16
|
end
|
17
17
|
end
|
data/lib/konjak/html_segmentor.rb
CHANGED
@@ -1,18 +1,19 @@
|
|
1
1
|
module Konjak
|
2
2
|
class HtmlSegmentor < Segmentor
|
3
|
+
SEGMENTS_PATTERNS = [
|
4
|
+
%r{<(?<start>p|h1|h2|h3|h4|h5|h6|li|title|td)>(.*?)</\k<start>>}m,
|
5
|
+
%r{<(?<start>p|h1|h2|h3|h4|h5|h6|li|title|td) [^>]*?>(.*?)</\k<start>>}m,
|
6
|
+
%r{<div>(.*?)</div>}m,
|
7
|
+
%r{<div [^>]*?>(.*?)</div>}m
|
8
|
+
]
|
9
|
+
|
3
10
|
def segments
|
4
11
|
segments = [content.dup]
|
5
12
|
|
6
13
|
begin
|
7
14
|
size = segments.size
|
8
15
|
|
9
|
-
|
10
|
-
%r{<(?<start>p|h1|h2|h3|h4|h5|h6|li|title|td)>(.*?)</\k<start>>}m,
|
11
|
-
%r{<(?<start>p|h1|h2|h3|h4|h5|h6|li|title|td) [^>]*?>(.*?)</\k<start>>}m,
|
12
|
-
%r{<div>(.*?)</div>}m,
|
13
|
-
%r{<div [^>]*?>(.*?)</div>}m
|
14
|
-
]
|
15
|
-
segments_patterns.each do |pattern|
|
16
|
+
SEGMENTS_PATTERNS.each do |pattern|
|
16
17
|
segments.map! do |s|
|
17
18
|
s.partition(pattern)
|
18
19
|
end
|
data/lib/konjak/map.rb
CHANGED
data/lib/konjak/note.rb
CHANGED
@@ -1,17 +1,19 @@
|
|
1
1
|
module Konjak
|
2
2
|
class Note < StructuralElement
|
3
|
+
TAG_NAME = 'note'
|
4
|
+
|
3
5
|
# optional attrs
|
4
6
|
tmx_attr_accessor(:xml_lang, :'xml:lang')
|
5
7
|
tmx_attr_accessor(:o_encoding, :"o-encoding")
|
6
8
|
|
7
9
|
# childrens
|
8
10
|
def text
|
9
|
-
|
11
|
+
super
|
10
12
|
end
|
11
13
|
|
12
14
|
# methods
|
13
15
|
def can_contain?(element)
|
14
|
-
|
16
|
+
String === element
|
15
17
|
end
|
16
18
|
end
|
17
19
|
end
|
data/lib/konjak/polytex_segmentor.rb
CHANGED
@@ -1,31 +1,33 @@
|
|
1
1
|
module Konjak
|
2
2
|
class PolytexSegmentor < Segmentor
|
3
|
+
|
4
|
+
SEGMENTS_PATTERNS = [
|
5
|
+
/\\begin\{(?<start>[^\}]+)\}([\n.]*?)\\end\{\k<start>\}/m,
|
6
|
+
/(?<=\\chapter\{)[^\}]+(?=\})/,
|
7
|
+
/(?<=\\section\{)[^\}]+(?=\})/,
|
8
|
+
/(?<=\\subsection\{)[^\}]+(?=\})/,
|
9
|
+
/\\footnote\{(?<gr>\\(?!footnote)[^\{]+\{[^\}]+\}(?:\{[^\}]+\})?\g<gr>|[^{])+\}/m,
|
10
|
+
/(?<=\\footnote\{)(?<gr>\\(?!footnote)[^\{]+\{[^\}]+\}(?:\{[^\}]+\})?\g<gr>|[^{])+(?=\})/m,
|
11
|
+
/(?<=\\codecaption\{).+(?= \\|\}$)/,
|
12
|
+
/(?<=\\caption\{).+(?=\\label\{.*\}\}$)/,
|
13
|
+
/(?<=\n)^.*$(?=\n)/m,
|
14
|
+
/# .*$/,
|
15
|
+
/(?<=^).+?[\.\?\!](?= |\n|\t)/,
|
16
|
+
/(?<=\()[^\.\n]+[\.\?\!](?=\))/,
|
17
|
+
/^ (?=[\w\\]+)/,
|
18
|
+
/^\s+% .*$/,
|
19
|
+
/^$/,
|
20
|
+
/\\noindent /,
|
21
|
+
/\\item /,
|
22
|
+
]
|
23
|
+
|
3
24
|
def segments
|
4
25
|
segments = [content.dup]
|
5
26
|
|
6
27
|
begin
|
7
28
|
size = segments.size
|
8
29
|
|
9
|
-
|
10
|
-
/\\begin\{(?<start>[^\}]+)\}([\n.]*?)\\end\{\k<start>\}/m,
|
11
|
-
/(?<=\\chapter\{)[^\}]+(?=\})/,
|
12
|
-
/(?<=\\section\{)[^\}]+(?=\})/,
|
13
|
-
/(?<=\\subsection\{)[^\}]+(?=\})/,
|
14
|
-
/\\footnote\{(?<gr>\\(?!footnote)[^\{]+\{[^\}]+\}(?:\{[^\}]+\})?\g<gr>|[^{])+\}/m,
|
15
|
-
/(?<=\\footnote\{)(?<gr>\\(?!footnote)[^\{]+\{[^\}]+\}(?:\{[^\}]+\})?\g<gr>|[^{])+(?=\})/m,
|
16
|
-
/(?<=\\codecaption\{).+(?= \\|\}$)/,
|
17
|
-
/(?<=\\caption\{).+(?=\\label\{.*\}\}$)/,
|
18
|
-
/(?<=\n)^.*$(?=\n)/m,
|
19
|
-
/# .*$/,
|
20
|
-
/(?<=^).+?[\.\?\!](?= |\n|\t)/,
|
21
|
-
/(?<=\()[^\.\n]+[\.\?\!](?=\))/,
|
22
|
-
/^ (?=[\w\\]+)/,
|
23
|
-
/^\s+% .*$/,
|
24
|
-
/^$/,
|
25
|
-
/\\noindent /,
|
26
|
-
/\\item /,
|
27
|
-
]
|
28
|
-
segments_patterns.each do |pattern|
|
30
|
+
SEGMENTS_PATTERNS.each do |pattern|
|
29
31
|
segments.map! do |s|
|
30
32
|
s.partition(pattern)
|
31
33
|
end
|
@@ -37,4 +39,5 @@ module Konjak
|
|
37
39
|
segments
|
38
40
|
end
|
39
41
|
end
|
42
|
+
|
40
43
|
end
|
data/lib/konjak/property.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
module Konjak
|
2
2
|
class Property < StructuralElement
|
3
|
+
TAG_NAME = 'prop'
|
4
|
+
|
3
5
|
# required attrs
|
4
6
|
tmx_attr_accessor(:type, required: true)
|
5
7
|
|
@@ -9,14 +11,14 @@ module Konjak
|
|
9
11
|
|
10
12
|
# childrens
|
11
13
|
def text
|
12
|
-
|
14
|
+
super
|
13
15
|
end
|
14
16
|
|
15
17
|
# methods
|
16
18
|
def can_contain?(element)
|
17
19
|
# FIXME
|
18
20
|
# Tool-specific data or text.
|
19
|
-
|
21
|
+
String === element
|
20
22
|
end
|
21
23
|
|
22
24
|
def unpublished?
|
data/lib/konjak/segment/gtt.rb
CHANGED
data/lib/konjak/segment.rb
CHANGED
@@ -4,26 +4,29 @@ require 'mem'
|
|
4
4
|
module Konjak
|
5
5
|
# container
|
6
6
|
class Segment < StructuralElement
|
7
|
+
TAG_NAME = 'seg'
|
8
|
+
WHITE_SPACE_PATTERN_TEXT = '\s'
|
9
|
+
POSSESSIVE_QUALIFIER = '++'
|
10
|
+
|
7
11
|
include GTT
|
8
12
|
include Mem
|
9
13
|
|
10
14
|
# children
|
11
15
|
def text
|
12
|
-
|
16
|
+
super
|
13
17
|
end
|
14
18
|
|
15
19
|
# methods
|
16
20
|
def can_contain?(element)
|
17
|
-
[
|
21
|
+
[String, BeginPairedTag, EndPairedTag, IsolatedTag, Placeholder, Highlight].any? {|c| c === element }
|
18
22
|
end
|
19
23
|
|
20
24
|
def compile_pattern
|
21
25
|
regexp = Regexp.escape(text)
|
22
|
-
regexp.gsub!(/(?<!^)\\\s/) {
|
23
|
-
regexp.gsub!(/(?<!^)(?:\\s)+(?!$)/) {|s| s +
|
26
|
+
regexp.gsub!(/(?<!^)\\\s/) { WHITE_SPACE_PATTERN_TEXT }
|
27
|
+
regexp.gsub!(/(?<!^)(?:\\s)+(?!$)/) {|s| s + POSSESSIVE_QUALIFIER }
|
24
28
|
Regexp.compile(regexp)
|
25
29
|
end
|
26
|
-
memoize :compile_pattern
|
27
30
|
|
28
31
|
def translation_unit
|
29
32
|
TranslationUnit.new(translation_unit_variant.parent)
|
data/lib/konjak/sub_flow.rb
CHANGED
@@ -12,7 +12,7 @@ module Konjak
|
|
12
12
|
# They can be in any order, except that each <bpt> element must have a subsequent corresponding <ept> element.
|
13
13
|
|
14
14
|
def can_contain?(element)
|
15
|
-
[
|
15
|
+
[String, BeginPairedTag, EndPairedTag, IsolatedTag, Placeholder, Hilight].any? {|c| c === element }
|
16
16
|
end
|
17
17
|
end
|
18
18
|
end
|
data/lib/konjak/tmx.rb
CHANGED
@@ -7,12 +7,12 @@ module Konjak
|
|
7
7
|
|
8
8
|
# required element
|
9
9
|
def header
|
10
|
-
Header.new(root.at_xpath(
|
10
|
+
Header.new(root.at_xpath(Header::TAG_NAME))
|
11
11
|
end
|
12
12
|
|
13
13
|
# required element
|
14
14
|
def body
|
15
|
-
Body.new(root.at_xpath(
|
15
|
+
Body.new(root.at_xpath(Body::TAG_NAME))
|
16
16
|
end
|
17
17
|
|
18
18
|
# FIXME
|
data/lib/konjak/tmx_segmentor/strategy.rb
CHANGED
@@ -18,11 +18,13 @@ module Konjak
|
|
18
18
|
translation_units.each do |translation_unit|
|
19
19
|
segment = translation_unit.variant(@lang).segment
|
20
20
|
|
21
|
+
pat = compile_pattern(segment)
|
22
|
+
|
21
23
|
segments.map! {|text|
|
22
24
|
next text if text.length < min_segment_length
|
23
25
|
next text if text.is_a?(SegmentString)
|
24
26
|
|
25
|
-
split(segment, text)
|
27
|
+
split(pat, segment, text)
|
26
28
|
}.flatten!
|
27
29
|
end
|
28
30
|
segments
|
@@ -38,13 +40,16 @@ module Konjak
|
|
38
40
|
@options[:min_segment_length]
|
39
41
|
end
|
40
42
|
|
41
|
-
def split(segment, text)
|
43
|
+
def split(pat, segment, text)
|
42
44
|
texts = []
|
43
45
|
while true
|
44
46
|
break if text.length < min_segment_length
|
45
47
|
|
46
|
-
|
47
|
-
|
48
|
+
break unless text =~ pat
|
49
|
+
|
50
|
+
head = $`
|
51
|
+
match = $&
|
52
|
+
tail = $'
|
48
53
|
|
49
54
|
texts << head unless head.empty?
|
50
55
|
|
@@ -62,7 +67,6 @@ module Konjak
|
|
62
67
|
tu.variant(@lang).segment.text.length < min_segment_length
|
63
68
|
}
|
64
69
|
end
|
65
|
-
memoize :translation_units
|
66
70
|
end
|
67
71
|
end
|
68
72
|
end
|
data/lib/konjak/translation_unit.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
module Konjak
|
2
2
|
class TranslationUnit < StructuralElement
|
3
|
+
TAG_NAME = 'tu'
|
4
|
+
|
3
5
|
# optional attrs
|
4
6
|
tmx_attr_accessor(:tuid)
|
5
7
|
tmx_attr_accessor(:o_encoding, :"o-encoding")
|
@@ -19,8 +21,9 @@ module Konjak
|
|
19
21
|
|
20
22
|
# childrens
|
21
23
|
def variants
|
22
|
-
children.select {|c| c.name ==
|
24
|
+
children.select {|c| c.name == TranslationUnitVariant::TAG_NAME }.map! {|tuv| TranslationUnitVariant.new(tuv) }
|
23
25
|
end
|
26
|
+
memoize :variants
|
24
27
|
|
25
28
|
# methods
|
26
29
|
def can_contain?(element)
|
data/lib/konjak/translation_unit_variant.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
module Konjak
|
2
2
|
class TranslationUnitVariant < StructuralElement
|
3
|
+
TAG_NAME = 'tuv'
|
4
|
+
|
3
5
|
# required attrs
|
4
6
|
tmx_attr_accessor(:xml_lang, :'xml:lang', required: true)
|
5
7
|
|
@@ -18,16 +20,17 @@ module Konjak
|
|
18
20
|
|
19
21
|
# childrens
|
20
22
|
def notes
|
21
|
-
children.select {|c| c.name == 'note' }.map {|n| Note.new(n) }
|
23
|
+
children.select {|c| c.name == 'note' }.map! {|n| Note.new(n) }
|
22
24
|
end
|
23
25
|
|
24
26
|
def properties
|
25
|
-
children.select {|c| c.name == 'prop' }.map {|n| Property.new(n) }
|
27
|
+
children.select {|c| c.name == 'prop' }.map! {|n| Property.new(n) }
|
26
28
|
end
|
27
29
|
|
28
30
|
def segment
|
29
|
-
Segment.new(children.detect {|c| c.name ==
|
31
|
+
Segment.new(children.detect {|c| c.name == Segment::TAG_NAME })
|
30
32
|
end
|
33
|
+
memoize :segment
|
31
34
|
|
32
35
|
# methods
|
33
36
|
|
data/lib/konjak/user_defined_encoding.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
module Konjak
|
2
2
|
class UserDefinedEncoding < StructuralElement
|
3
|
+
TAG_NAME = 'ude'
|
4
|
+
|
3
5
|
# required attrs
|
4
6
|
tmx_attr_accessor(:name, required: true)
|
5
7
|
|
@@ -10,7 +12,7 @@ module Konjak
|
|
10
12
|
|
11
13
|
# childrens
|
12
14
|
def maps
|
13
|
-
children.select {|c| c.name ==
|
15
|
+
children.select {|c| c.name == Map::TAG_NAME }.map! {|n| Map.new(n) }
|
14
16
|
end
|
15
17
|
|
16
18
|
# methods
|
data/lib/konjak/version.rb
CHANGED
data/lib/konjak.rb
CHANGED
data/spec/konjak_parse_spec.rb
CHANGED
@@ -38,7 +38,7 @@ describe Konjak do
|
|
38
38
|
|
39
39
|
its(:xml_lang) { is_expected.to eq 'en' }
|
40
40
|
its(:o_encoding) { is_expected.to eq 'iso-8859-1' }
|
41
|
-
its(:text) { is_expected.to be_instance_of
|
41
|
+
its(:text) { is_expected.to be_instance_of String }
|
42
42
|
|
43
43
|
describe 'text' do
|
44
44
|
subject { super().text }
|
@@ -90,7 +90,7 @@ describe Konjak do
|
|
90
90
|
its(:xml_lang) { is_expected.to eq 'en' }
|
91
91
|
its(:o_encoding) { is_expected.to eq 'iso-8859-1' }
|
92
92
|
its(:type) { is_expected.to eq 'RTFPreamble' }
|
93
|
-
its(:text) { is_expected.to be_instance_of
|
93
|
+
its(:text) { is_expected.to be_instance_of String }
|
94
94
|
|
95
95
|
describe '.text' do
|
96
96
|
subject { super().text }
|
@@ -141,7 +141,7 @@ describe Konjak do
|
|
141
141
|
describe '.segment' do
|
142
142
|
subject { super().segment }
|
143
143
|
|
144
|
-
its(:text) { is_expected.to be_instance_of
|
144
|
+
its(:text) { is_expected.to be_instance_of String }
|
145
145
|
|
146
146
|
describe '.text' do
|
147
147
|
subject { super().text }
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: konjak
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.15
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Seiei Higa
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-08-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mem
|
@@ -161,7 +161,6 @@ files:
|
|
161
161
|
- lib/konjak/segmentor.rb
|
162
162
|
- lib/konjak/structural_element.rb
|
163
163
|
- lib/konjak/sub_flow.rb
|
164
|
-
- lib/konjak/text.rb
|
165
164
|
- lib/konjak/tmx.rb
|
166
165
|
- lib/konjak/tmx_segmentor.rb
|
167
166
|
- lib/konjak/tmx_segmentor/gtt_html_strategy.rb
|
@@ -216,4 +215,3 @@ test_files:
|
|
216
215
|
- spec/konjak_translate_spec.rb
|
217
216
|
- spec/spec_helper.rb
|
218
217
|
- spec/support/equal_xml_matcher.rb
|
219
|
-
has_rdoc:
|
data/lib/konjak/text.rb
DELETED