konjak 0.0.14 → 0.0.15
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/konjak/body.rb +3 -1
- data/lib/konjak/element.rb +3 -0
- data/lib/konjak/header.rb +5 -3
- data/lib/konjak/highlight.rb +1 -1
- data/lib/konjak/html_segmentor.rb +8 -7
- data/lib/konjak/map.rb +2 -0
- data/lib/konjak/note.rb +4 -2
- data/lib/konjak/polytex_segmentor.rb +23 -20
- data/lib/konjak/property.rb +4 -2
- data/lib/konjak/segment/gtt.rb +0 -1
- data/lib/konjak/segment.rb +8 -5
- data/lib/konjak/sub_flow.rb +1 -1
- data/lib/konjak/tmx.rb +2 -2
- data/lib/konjak/tmx_segmentor/strategy.rb +9 -5
- data/lib/konjak/translation_unit.rb +4 -1
- data/lib/konjak/translation_unit_variant.rb +6 -3
- data/lib/konjak/user_defined_encoding.rb +3 -1
- data/lib/konjak/version.rb +1 -1
- data/lib/konjak.rb +0 -1
- data/spec/konjak_parse_spec.rb +3 -3
- metadata +2 -4
- data/lib/konjak/text.rb +0 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 219a842656b4743ac6f782fdca2167dcf85e51ab
|
4
|
+
data.tar.gz: 2a1c74e8771791b5ac9dab000a7d556f88a14335
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f26891f463c259af8eee10698208a6df50c92b49c5b0f0a5065feadcccd10fe45733b8598eec5e47d35286feb519156aa8b82600576eeccd2d6cf39ecaf29daf
|
7
|
+
data.tar.gz: c88fe8b6fdbab45f4f5ddcc8c851869e47b62fc7bfc800676dba0f63352e193ffb453b15f95244eb364398b5c841629f2ce2d9df5b9edeca2a7b23e7c5c8caad
|
data/lib/konjak/body.rb
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
module Konjak
|
2
2
|
class Body < StructuralElement
|
3
|
+
TAG_NAME = 'body'
|
4
|
+
|
3
5
|
# childrens
|
4
6
|
def translation_units
|
5
|
-
children.select {|c| c.name ==
|
7
|
+
children.select {|c| c.name == TranslationUnit::TAG_NAME }.map! {|tu| TranslationUnit.new(tu) }
|
6
8
|
end
|
7
9
|
|
8
10
|
# methods
|
data/lib/konjak/element.rb
CHANGED
data/lib/konjak/header.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
module Konjak
|
2
2
|
class Header < StructuralElement
|
3
|
+
TAG_NAME = 'header'
|
4
|
+
|
3
5
|
# required attrs
|
4
6
|
tmx_attr_accessor(:creation_tool, :creationtool, required: true)
|
5
7
|
tmx_attr_accessor(:creation_tool_version, :creationtoolversion, required: true)
|
@@ -18,15 +20,15 @@ module Konjak
|
|
18
20
|
|
19
21
|
# childrens
|
20
22
|
def notes
|
21
|
-
children.select {|c| c.name ==
|
23
|
+
children.select {|c| c.name == Note::TAG_NAME }.map! {|n| Note.new(n) }
|
22
24
|
end
|
23
25
|
|
24
26
|
def user_defined_encodings
|
25
|
-
children.select {|c| c.name ==
|
27
|
+
children.select {|c| c.name == UserDefinedEncoding::TAG_NAME }.map! {|n| UserDefinedEncoding.new(n) }
|
26
28
|
end
|
27
29
|
|
28
30
|
def properties
|
29
|
-
children.select {|c| c.name ==
|
31
|
+
children.select {|c| c.name == Property::TAG_NAME }.map! {|n| Property.new(n) }
|
30
32
|
end
|
31
33
|
|
32
34
|
# methods
|
data/lib/konjak/highlight.rb
CHANGED
@@ -11,7 +11,7 @@ module Konjak
|
|
11
11
|
# Zero, one or more of the following elements: <bpt>, <ept>, <it>, <ph>, and <hi>.
|
12
12
|
# They can be in any order, except that each <bpt> element must have a subsequent corresponding <ept> element.
|
13
13
|
def can_contain?(element)
|
14
|
-
[
|
14
|
+
[String, BeginPairedTag, EndPairedTag, IsolatedTag, Placeholder, Hilight].any? {|c| c === element }
|
15
15
|
end
|
16
16
|
end
|
17
17
|
end
|
data/lib/konjak/html_segmentor.rb
CHANGED
@@ -1,18 +1,19 @@
|
|
1
1
|
module Konjak
|
2
2
|
class HtmlSegmentor < Segmentor
|
3
|
+
SEGMENTS_PATTERNS = [
|
4
|
+
%r{<(?<start>p|h1|h2|h3|h4|h5|h6|li|title|td)>(.*?)</\k<start>>}m,
|
5
|
+
%r{<(?<start>p|h1|h2|h3|h4|h5|h6|li|title|td) [^>]*?>(.*?)</\k<start>>}m,
|
6
|
+
%r{<div>(.*?)</div>}m,
|
7
|
+
%r{<div [^>]*?>(.*?)</div>}m
|
8
|
+
]
|
9
|
+
|
3
10
|
def segments
|
4
11
|
segments = [content.dup]
|
5
12
|
|
6
13
|
begin
|
7
14
|
size = segments.size
|
8
15
|
|
9
|
-
|
10
|
-
%r{<(?<start>p|h1|h2|h3|h4|h5|h6|li|title|td)>(.*?)</\k<start>>}m,
|
11
|
-
%r{<(?<start>p|h1|h2|h3|h4|h5|h6|li|title|td) [^>]*?>(.*?)</\k<start>>}m,
|
12
|
-
%r{<div>(.*?)</div>}m,
|
13
|
-
%r{<div [^>]*?>(.*?)</div>}m
|
14
|
-
]
|
15
|
-
segments_patterns.each do |pattern|
|
16
|
+
SEGMENTS_PATTERNS.each do |pattern|
|
16
17
|
segments.map! do |s|
|
17
18
|
s.partition(pattern)
|
18
19
|
end
|
data/lib/konjak/map.rb
CHANGED
data/lib/konjak/note.rb
CHANGED
@@ -1,17 +1,19 @@
|
|
1
1
|
module Konjak
|
2
2
|
class Note < StructuralElement
|
3
|
+
TAG_NAME = 'note'
|
4
|
+
|
3
5
|
# optional attrs
|
4
6
|
tmx_attr_accessor(:xml_lang, :'xml:lang')
|
5
7
|
tmx_attr_accessor(:o_encoding, :"o-encoding")
|
6
8
|
|
7
9
|
# childrens
|
8
10
|
def text
|
9
|
-
|
11
|
+
super
|
10
12
|
end
|
11
13
|
|
12
14
|
# methods
|
13
15
|
def can_contain?(element)
|
14
|
-
|
16
|
+
String === element
|
15
17
|
end
|
16
18
|
end
|
17
19
|
end
|
data/lib/konjak/polytex_segmentor.rb
CHANGED
@@ -1,31 +1,33 @@
|
|
1
1
|
module Konjak
|
2
2
|
class PolytexSegmentor < Segmentor
|
3
|
+
|
4
|
+
SEGMENTS_PATTERNS = [
|
5
|
+
/\\begin\{(?<start>[^\}]+)\}([\n.]*?)\\end\{\k<start>\}/m,
|
6
|
+
/(?<=\\chapter\{)[^\}]+(?=\})/,
|
7
|
+
/(?<=\\section\{)[^\}]+(?=\})/,
|
8
|
+
/(?<=\\subsection\{)[^\}]+(?=\})/,
|
9
|
+
/\\footnote\{(?<gr>\\(?!footnote)[^\{]+\{[^\}]+\}(?:\{[^\}]+\})?\g<gr>|[^{])+\}/m,
|
10
|
+
/(?<=\\footnote\{)(?<gr>\\(?!footnote)[^\{]+\{[^\}]+\}(?:\{[^\}]+\})?\g<gr>|[^{])+(?=\})/m,
|
11
|
+
/(?<=\\codecaption\{).+(?= \\|\}$)/,
|
12
|
+
/(?<=\\caption\{).+(?=\\label\{.*\}\}$)/,
|
13
|
+
/(?<=\n)^.*$(?=\n)/m,
|
14
|
+
/# .*$/,
|
15
|
+
/(?<=^).+?[\.\?\!](?= |\n|\t)/,
|
16
|
+
/(?<=\()[^\.\n]+[\.\?\!](?=\))/,
|
17
|
+
/^ (?=[\w\\]+)/,
|
18
|
+
/^\s+% .*$/,
|
19
|
+
/^$/,
|
20
|
+
/\\noindent /,
|
21
|
+
/\\item /,
|
22
|
+
]
|
23
|
+
|
3
24
|
def segments
|
4
25
|
segments = [content.dup]
|
5
26
|
|
6
27
|
begin
|
7
28
|
size = segments.size
|
8
29
|
|
9
|
-
|
10
|
-
/\\begin\{(?<start>[^\}]+)\}([\n.]*?)\\end\{\k<start>\}/m,
|
11
|
-
/(?<=\\chapter\{)[^\}]+(?=\})/,
|
12
|
-
/(?<=\\section\{)[^\}]+(?=\})/,
|
13
|
-
/(?<=\\subsection\{)[^\}]+(?=\})/,
|
14
|
-
/\\footnote\{(?<gr>\\(?!footnote)[^\{]+\{[^\}]+\}(?:\{[^\}]+\})?\g<gr>|[^{])+\}/m,
|
15
|
-
/(?<=\\footnote\{)(?<gr>\\(?!footnote)[^\{]+\{[^\}]+\}(?:\{[^\}]+\})?\g<gr>|[^{])+(?=\})/m,
|
16
|
-
/(?<=\\codecaption\{).+(?= \\|\}$)/,
|
17
|
-
/(?<=\\caption\{).+(?=\\label\{.*\}\}$)/,
|
18
|
-
/(?<=\n)^.*$(?=\n)/m,
|
19
|
-
/# .*$/,
|
20
|
-
/(?<=^).+?[\.\?\!](?= |\n|\t)/,
|
21
|
-
/(?<=\()[^\.\n]+[\.\?\!](?=\))/,
|
22
|
-
/^ (?=[\w\\]+)/,
|
23
|
-
/^\s+% .*$/,
|
24
|
-
/^$/,
|
25
|
-
/\\noindent /,
|
26
|
-
/\\item /,
|
27
|
-
]
|
28
|
-
segments_patterns.each do |pattern|
|
30
|
+
SEGMENTS_PATTERNS.each do |pattern|
|
29
31
|
segments.map! do |s|
|
30
32
|
s.partition(pattern)
|
31
33
|
end
|
@@ -37,4 +39,5 @@ module Konjak
|
|
37
39
|
segments
|
38
40
|
end
|
39
41
|
end
|
42
|
+
|
40
43
|
end
|
data/lib/konjak/property.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
module Konjak
|
2
2
|
class Property < StructuralElement
|
3
|
+
TAG_NAME = 'prop'
|
4
|
+
|
3
5
|
# required attrs
|
4
6
|
tmx_attr_accessor(:type, required: true)
|
5
7
|
|
@@ -9,14 +11,14 @@ module Konjak
|
|
9
11
|
|
10
12
|
# childrens
|
11
13
|
def text
|
12
|
-
|
14
|
+
super
|
13
15
|
end
|
14
16
|
|
15
17
|
# methods
|
16
18
|
def can_contain?(element)
|
17
19
|
# FIXME
|
18
20
|
# Tool-specific data or text.
|
19
|
-
|
21
|
+
String === element
|
20
22
|
end
|
21
23
|
|
22
24
|
def unpublished?
|
data/lib/konjak/segment/gtt.rb
CHANGED
data/lib/konjak/segment.rb
CHANGED
@@ -4,26 +4,29 @@ require 'mem'
|
|
4
4
|
module Konjak
|
5
5
|
# container
|
6
6
|
class Segment < StructuralElement
|
7
|
+
TAG_NAME = 'seg'
|
8
|
+
WHITE_SPACE_PATTERN_TEXT = '\s'
|
9
|
+
POSSESSIVE_QUALIFIER = '++'
|
10
|
+
|
7
11
|
include GTT
|
8
12
|
include Mem
|
9
13
|
|
10
14
|
# children
|
11
15
|
def text
|
12
|
-
|
16
|
+
super
|
13
17
|
end
|
14
18
|
|
15
19
|
# methods
|
16
20
|
def can_contain?(element)
|
17
|
-
[
|
21
|
+
[String, BeginPairedTag, EndPairedTag, IsolatedTag, Placeholder, Highlight].any? {|c| c === element }
|
18
22
|
end
|
19
23
|
|
20
24
|
def compile_pattern
|
21
25
|
regexp = Regexp.escape(text)
|
22
|
-
regexp.gsub!(/(?<!^)\\\s/) {
|
23
|
-
regexp.gsub!(/(?<!^)(?:\\s)+(?!$)/) {|s| s +
|
26
|
+
regexp.gsub!(/(?<!^)\\\s/) { WHITE_SPACE_PATTERN_TEXT }
|
27
|
+
regexp.gsub!(/(?<!^)(?:\\s)+(?!$)/) {|s| s + POSSESSIVE_QUALIFIER }
|
24
28
|
Regexp.compile(regexp)
|
25
29
|
end
|
26
|
-
memoize :compile_pattern
|
27
30
|
|
28
31
|
def translation_unit
|
29
32
|
TranslationUnit.new(translation_unit_variant.parent)
|
data/lib/konjak/sub_flow.rb
CHANGED
@@ -12,7 +12,7 @@ module Konjak
|
|
12
12
|
# They can be in any order, except that each <bpt> element must have a subsequent corresponding <ept> element.
|
13
13
|
|
14
14
|
def can_contain?(element)
|
15
|
-
[
|
15
|
+
[String, BeginPairedTag, EndPairedTag, IsolatedTag, Placeholder, Hilight].any? {|c| c === element }
|
16
16
|
end
|
17
17
|
end
|
18
18
|
end
|
data/lib/konjak/tmx.rb
CHANGED
@@ -7,12 +7,12 @@ module Konjak
|
|
7
7
|
|
8
8
|
# required element
|
9
9
|
def header
|
10
|
-
Header.new(root.at_xpath(
|
10
|
+
Header.new(root.at_xpath(Header::TAG_NAME))
|
11
11
|
end
|
12
12
|
|
13
13
|
# required element
|
14
14
|
def body
|
15
|
-
Body.new(root.at_xpath(
|
15
|
+
Body.new(root.at_xpath(Body::TAG_NAME))
|
16
16
|
end
|
17
17
|
|
18
18
|
# FIXME
|
data/lib/konjak/tmx_segmentor/strategy.rb
CHANGED
@@ -18,11 +18,13 @@ module Konjak
|
|
18
18
|
translation_units.each do |translation_unit|
|
19
19
|
segment = translation_unit.variant(@lang).segment
|
20
20
|
|
21
|
+
pat = compile_pattern(segment)
|
22
|
+
|
21
23
|
segments.map! {|text|
|
22
24
|
next text if text.length < min_segment_length
|
23
25
|
next text if text.is_a?(SegmentString)
|
24
26
|
|
25
|
-
split(segment, text)
|
27
|
+
split(pat, segment, text)
|
26
28
|
}.flatten!
|
27
29
|
end
|
28
30
|
segments
|
@@ -38,13 +40,16 @@ module Konjak
|
|
38
40
|
@options[:min_segment_length]
|
39
41
|
end
|
40
42
|
|
41
|
-
def split(segment, text)
|
43
|
+
def split(pat, segment, text)
|
42
44
|
texts = []
|
43
45
|
while true
|
44
46
|
break if text.length < min_segment_length
|
45
47
|
|
46
|
-
|
47
|
-
|
48
|
+
break unless text =~ pat
|
49
|
+
|
50
|
+
head = $`
|
51
|
+
match = $&
|
52
|
+
tail = $'
|
48
53
|
|
49
54
|
texts << head unless head.empty?
|
50
55
|
|
@@ -62,7 +67,6 @@ module Konjak
|
|
62
67
|
tu.variant(@lang).segment.text.length < min_segment_length
|
63
68
|
}
|
64
69
|
end
|
65
|
-
memoize :translation_units
|
66
70
|
end
|
67
71
|
end
|
68
72
|
end
|
data/lib/konjak/translation_unit.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
module Konjak
|
2
2
|
class TranslationUnit < StructuralElement
|
3
|
+
TAG_NAME = 'tu'
|
4
|
+
|
3
5
|
# optional attrs
|
4
6
|
tmx_attr_accessor(:tuid)
|
5
7
|
tmx_attr_accessor(:o_encoding, :"o-encoding")
|
@@ -19,8 +21,9 @@ module Konjak
|
|
19
21
|
|
20
22
|
# childrens
|
21
23
|
def variants
|
22
|
-
children.select {|c| c.name ==
|
24
|
+
children.select {|c| c.name == TranslationUnitVariant::TAG_NAME }.map! {|tuv| TranslationUnitVariant.new(tuv) }
|
23
25
|
end
|
26
|
+
memoize :variants
|
24
27
|
|
25
28
|
# methods
|
26
29
|
def can_contain?(element)
|
data/lib/konjak/translation_unit_variant.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
module Konjak
|
2
2
|
class TranslationUnitVariant < StructuralElement
|
3
|
+
TAG_NAME = 'tuv'
|
4
|
+
|
3
5
|
# required attrs
|
4
6
|
tmx_attr_accessor(:xml_lang, :'xml:lang', required: true)
|
5
7
|
|
@@ -18,16 +20,17 @@ module Konjak
|
|
18
20
|
|
19
21
|
# childrens
|
20
22
|
def notes
|
21
|
-
children.select {|c| c.name == 'note' }.map {|n| Note.new(n) }
|
23
|
+
children.select {|c| c.name == 'note' }.map! {|n| Note.new(n) }
|
22
24
|
end
|
23
25
|
|
24
26
|
def properties
|
25
|
-
children.select {|c| c.name == 'prop' }.map {|n| Property.new(n) }
|
27
|
+
children.select {|c| c.name == 'prop' }.map! {|n| Property.new(n) }
|
26
28
|
end
|
27
29
|
|
28
30
|
def segment
|
29
|
-
Segment.new(children.detect {|c| c.name ==
|
31
|
+
Segment.new(children.detect {|c| c.name == Segment::TAG_NAME })
|
30
32
|
end
|
33
|
+
memoize :segment
|
31
34
|
|
32
35
|
# methods
|
33
36
|
|
data/lib/konjak/user_defined_encoding.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
module Konjak
|
2
2
|
class UserDefinedEncoding < StructuralElement
|
3
|
+
TAG_NAME = 'ude'
|
4
|
+
|
3
5
|
# required attrs
|
4
6
|
tmx_attr_accessor(:name, required: true)
|
5
7
|
|
@@ -10,7 +12,7 @@ module Konjak
|
|
10
12
|
|
11
13
|
# childrens
|
12
14
|
def maps
|
13
|
-
children.select {|c| c.name ==
|
15
|
+
children.select {|c| c.name == Map::TAG_NAME }.map! {|n| Map.new(n) }
|
14
16
|
end
|
15
17
|
|
16
18
|
# methods
|
data/lib/konjak/version.rb
CHANGED
data/lib/konjak.rb
CHANGED
data/spec/konjak_parse_spec.rb
CHANGED
@@ -38,7 +38,7 @@ describe Konjak do
|
|
38
38
|
|
39
39
|
its(:xml_lang) { is_expected.to eq 'en' }
|
40
40
|
its(:o_encoding) { is_expected.to eq 'iso-8859-1' }
|
41
|
-
its(:text) { is_expected.to be_instance_of
|
41
|
+
its(:text) { is_expected.to be_instance_of String }
|
42
42
|
|
43
43
|
describe 'text' do
|
44
44
|
subject { super().text }
|
@@ -90,7 +90,7 @@ describe Konjak do
|
|
90
90
|
its(:xml_lang) { is_expected.to eq 'en' }
|
91
91
|
its(:o_encoding) { is_expected.to eq 'iso-8859-1' }
|
92
92
|
its(:type) { is_expected.to eq 'RTFPreamble' }
|
93
|
-
its(:text) { is_expected.to be_instance_of
|
93
|
+
its(:text) { is_expected.to be_instance_of String }
|
94
94
|
|
95
95
|
describe '.text' do
|
96
96
|
subject { super().text }
|
@@ -141,7 +141,7 @@ describe Konjak do
|
|
141
141
|
describe '.segment' do
|
142
142
|
subject { super().segment }
|
143
143
|
|
144
|
-
its(:text) { is_expected.to be_instance_of
|
144
|
+
its(:text) { is_expected.to be_instance_of String }
|
145
145
|
|
146
146
|
describe '.text' do
|
147
147
|
subject { super().text }
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: konjak
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.14
|
4
|
+
version: 0.0.15
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Seiei Higa
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-08-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mem
|
@@ -161,7 +161,6 @@ files:
|
|
161
161
|
- lib/konjak/segmentor.rb
|
162
162
|
- lib/konjak/structural_element.rb
|
163
163
|
- lib/konjak/sub_flow.rb
|
164
|
-
- lib/konjak/text.rb
|
165
164
|
- lib/konjak/tmx.rb
|
166
165
|
- lib/konjak/tmx_segmentor.rb
|
167
166
|
- lib/konjak/tmx_segmentor/gtt_html_strategy.rb
|
@@ -216,4 +215,3 @@ test_files:
|
|
216
215
|
- spec/konjak_translate_spec.rb
|
217
216
|
- spec/spec_helper.rb
|
218
217
|
- spec/support/equal_xml_matcher.rb
|
219
|
-
has_rdoc:
|
data/lib/konjak/text.rb
DELETED