konjak 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ea6b25de9fe1bb666b02b8db7a3253619fe6cc5b
4
- data.tar.gz: 4bf4b8cdc708b5904c16e8d3b656c37b15c26f4e
3
+ metadata.gz: 0150635f9f3db7b2d052aee30b1db085162eceaa
4
+ data.tar.gz: f2a5b91c9dc7a9a99d84c9ac979a67893959ab56
5
5
  SHA512:
6
- metadata.gz: 94607b5df832840753b9a863dec409c98a283d9be758f40be2af1522cdd0815fe7615390f868fd2afa89133046a9418de25b1c568146cc5f3ceb491854c423c6
7
- data.tar.gz: bef5ef91e252bfadde9f6b8f6cb8fd6674efd4ae4e2d4c28a4f65e88c9a0330e365faf7fe4ced678a56a876f185dd8dac0d87f3877ef4d13f7d12be4939a94f1
6
+ metadata.gz: add182fac41dada9dce73a25f1a7b90ca1663fb4e52c0ae836e190cbd0bc3795356917b7f88918af1a61c8d2f7dc73ca09c6a268398f5c6f3a78d1c62a18d365
7
+ data.tar.gz: 21a97c4060b425a7ebfc9b3df5b6edd4de26d9c2ec874df619155257067f844b70a7efc71707cb1bb89a53b9a64015ff0aee739256b4df9acbf1a5062f85d873
@@ -1,6 +1,6 @@
1
- Copyright (c) 2014 Seiei Higa
1
+ Copyright © 2014-2015 [YassLab](http://yasslab.jp)
2
2
 
3
- MIT License
3
+ The MIT License
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining
6
6
  a copy of this software and associated documentation files (the
data/README.md CHANGED
@@ -31,3 +31,9 @@ Or install it yourself as:
31
31
  3. Commit your changes (`git commit -am 'Add some feature'`)
32
32
  4. Push to the branch (`git push origin my-new-feature`)
33
33
  5. Create new Pull Request
34
+
35
+ ## License
36
+
37
+ Copyright © 2015 [YassLab](http://yasslab.jp)
38
+
39
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
@@ -0,0 +1,27 @@
1
+ module Konjak
2
+ class HtmlSegmentor < Segmentor
3
+ def segments
4
+ segments = [content.dup]
5
+
6
+ begin
7
+ size = segments.size
8
+
9
+ segments_patterns = [
10
+ %r{<(?<start>p|h1|h2|h3|h4|h5|h6|li|title|td)>(.*?)</\k<start>>}m,
11
+ %r{<(?<start>p|h1|h2|h3|h4|h5|h6|li|title|td) [^>]*?>(.*?)</\k<start>>}m,
12
+ %r{<div>(.*?)</div>}m,
13
+ %r{<div [^>]*?>(.*?)</div>}m
14
+ ]
15
+ segments_patterns.each do |pattern|
16
+ segments.map! do |s|
17
+ s.partition(pattern)
18
+ end
19
+ segments.flatten!
20
+ segments.reject!(&:empty?)
21
+ end
22
+ end while segments.size != size
23
+
24
+ segments
25
+ end
26
+ end
27
+ end
data/lib/konjak/parser.rb CHANGED
@@ -2,7 +2,13 @@ require 'nokogiri'
2
2
 
3
3
  module Konjak
4
4
  class Parser
5
- def parse(xml, gtt: false)
5
+ attr_accessor :gtt
6
+
7
+ def initialize(gtt: false)
8
+ @gtt = gtt
9
+ end
10
+
11
+ def parse(xml)
6
12
  if gtt
7
13
  # FIXME
8
14
  xml = xml.gsub(/&amp;(#\d+|#x[0-9a-fA-F]+|[0-9a-zA-Z]+);/) { "&#{$1};" }
@@ -0,0 +1,40 @@
1
+ module Konjak
2
+ class PolytexSegmentor < Segmentor
3
+ def segments
4
+ segments = [content.dup]
5
+
6
+ begin
7
+ size = segments.size
8
+
9
+ segments_patterns = [
10
+ /\\begin\{(?<start>[^\}]+)\}([\n.]*?)\\end\{\k<start>\}/m,
11
+ /(?<=\\chapter\{)[^\}]+(?=\})/,
12
+ /(?<=\\section\{)[^\}]+(?=\})/,
13
+ /(?<=\\subsection\{)[^\}]+(?=\})/,
14
+ /\\footnote\{(?<gr>\\(?!footnote)[^\{]+\{[^\}]+\}(?:\{[^\}]+\})?\g<gr>|[^{])+\}/m,
15
+ /(?<=\\footnote\{)(?<gr>\\(?!footnote)[^\{]+\{[^\}]+\}(?:\{[^\}]+\})?\g<gr>|[^{])+(?=\})/m,
16
+ /(?<=\\codecaption\{).+(?= \\|\}$)/,
17
+ /(?<=\\caption\{).+(?=\\label\{.*\}\}$)/,
18
+ /(?<=\n)^.*$(?=\n)/m,
19
+ /# .*$/,
20
+ /(?<=^).+?[\.\?\!](?= |\n|\t)/,
21
+ /(?<=\()[^\.\n]+[\.\?\!](?=\))/,
22
+ /^ (?=[\w\\]+)/,
23
+ /^\s+% .*$/,
24
+ /^$/,
25
+ /\\noindent /,
26
+ /\\item /,
27
+ ]
28
+ segments_patterns.each do |pattern|
29
+ segments.map! do |s|
30
+ s.partition(pattern)
31
+ end
32
+ segments.flatten!
33
+ segments.reject!(&:empty?)
34
+ end
35
+ end while segments.size != size
36
+
37
+ segments
38
+ end
39
+ end
40
+ end
@@ -0,0 +1,39 @@
1
+ module Konjak
2
+ class Segment < StructuralElement
3
+ module GTT
4
+ Tag = Struct.new(:gtt, :html)
5
+
6
+ def compile_gtt_html_pattern
7
+ regexp = Regexp.escape(text)
8
+ gtt_tag_ns.each do |n|
9
+ regexp = regexp.sub(/\\\{#{n}\\\}/) { "(?<n#{n}><(?<_#{n}>\\w+)[^>]*>)" }
10
+ regexp = regexp.gsub(/\\\{#{n}\\\}/) { "\\k<n#{n}>" }
11
+ regexp = regexp.gsub(/\\\{\/#{n}\\\}/) { "</\\k<_#{n}>>" }
12
+ end
13
+ Regexp.compile(regexp)
14
+ end
15
+
16
+ def extract_gtt_tags_from(text)
17
+ m = text.match(compile_gtt_html_pattern)
18
+ gtt_tag_ns.each_with_object([]) do |n, tags|
19
+ tags << Tag.new("{#{n}}", m["n#{n}"])
20
+ tags << Tag.new("{/#{n}}", "</#{m["_#{n}"]}>")
21
+ end
22
+ end
23
+
24
+ def interpolate_gtt_tags(tags)
25
+ new_text = self.text.dup
26
+ tags.each do |tag|
27
+ new_text = new_text.gsub(tag[:gtt], tag[:html])
28
+ end
29
+ new_text
30
+ end
31
+
32
+ private
33
+
34
+ def gtt_tag_ns
35
+ text.scan(/\{(\d+)\}/).flatten.uniq
36
+ end
37
+ end
38
+ end
39
+ end
@@ -1,6 +1,10 @@
1
+ require 'konjak/segment/gtt'
2
+
1
3
  module Konjak
2
4
  # container
3
5
  class Segment < StructuralElement
6
+ include GTT
7
+
4
8
  # children
5
9
  def text
6
10
  Text.new(super)
@@ -10,5 +14,13 @@ module Konjak
10
14
  def can_contain?(element)
11
15
  [Text, BeginPairedTag, EndPairedTag, IsolatedTag, Placeholder, Highlight].any? {|c| c === element }
12
16
  end
17
+
18
+ def translation_unit
19
+ TranslationUnit.new(translation_unit_variant.parent)
20
+ end
21
+
22
+ def translation_unit_variant
23
+ TranslationUnitVariant.new(parent)
24
+ end
13
25
  end
14
26
  end
@@ -0,0 +1,10 @@
1
+ module Konjak
2
+ class Segmentor
3
+ attr_accessor :content, :options
4
+
5
+ def initialize(content, options)
6
+ @content = content
7
+ @options = options
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,28 @@
1
+ require 'konjak/tmx_segmentor/segment_string'
2
+ require 'konjak/tmx_segmentor/strategy'
3
+
4
+ module Konjak
5
+ class TmxSegmentor < Segmentor
6
+ class GttHtmlStrategy < Strategy
7
+
8
+ private
9
+
10
+ def split(translation_unit, text)
11
+ segment = translation_unit.variant(@lang).segment
12
+ pattern = segment.compile_gtt_html_pattern
13
+
14
+ texts = []
15
+ while true
16
+ head, match, tail = text.partition(pattern)
17
+ break if match.empty?
18
+ texts << head unless head.empty?
19
+
20
+ texts << SegmentString.new(match, segment)
21
+
22
+ text = tail
23
+ end
24
+ texts << text
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,14 @@
1
+ require 'konjak/segmentor'
2
+
3
+ module Konjak
4
+ class TmxSegmentor < Segmentor
5
+ class SegmentString < String
6
+ attr_accessor :segment
7
+
8
+ def initialize(str, segment)
9
+ super(str)
10
+ @segment = segment
11
+ end
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,37 @@
1
+ require 'mem'
2
+ require 'konjak/segmentor'
3
+ require 'konjak/tmx_segmentor/segment_string'
4
+
5
+ module Konjak
6
+ class TmxSegmentor < Segmentor
7
+ class Strategy
8
+ include Mem
9
+
10
+ def initialize(tmx, lang)
11
+ @tmx = tmx
12
+ @lang = lang
13
+ end
14
+
15
+ def segmentize(text)
16
+ segments = [text]
17
+ translation_units.each do |translation_unit|
18
+ segments.map! {|text|
19
+ next text if text.is_a?(SegmentString)
20
+
21
+ split(translation_unit, text)
22
+ }.flatten!
23
+ end
24
+ segments
25
+ end
26
+
27
+ private
28
+
29
+ def translation_units
30
+ @tmx.body.translation_units.sort_by {|tu|
31
+ -tu.variant(@lang).segment.text.length
32
+ }
33
+ end
34
+ memoize :translation_units
35
+ end
36
+ end
37
+ end
@@ -0,0 +1,28 @@
1
+ require 'konjak/tmx_segmentor/segment_string'
2
+ require 'konjak/tmx_segmentor/strategy'
3
+
4
+ module Konjak
5
+ class TmxSegmentor < Segmentor
6
+ class TextStrategy < Strategy
7
+
8
+ private
9
+
10
+ def split(translation_unit, text)
11
+ segment = translation_unit.variant(@lang).segment
12
+ segment_text = segment.text
13
+
14
+ texts = []
15
+ while true
16
+ head, match, tail = text.partition(segment_text)
17
+ break if match.empty?
18
+ texts << head unless head.empty?
19
+
20
+ texts << SegmentString.new(match, segment)
21
+
22
+ text = tail
23
+ end
24
+ texts << text
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,40 @@
1
+ require 'konjak/segmentor'
2
+ require 'konjak/tmx_segmentor/gtt_html_strategy'
3
+ require 'konjak/tmx_segmentor/text_strategy'
4
+
5
+ module Konjak
6
+ class TmxSegmentor < Segmentor
7
+
8
+ STRATEGIES = {
9
+ text: TextStrategy,
10
+ gtt_html: GttHtmlStrategy,
11
+ }
12
+
13
+ def segments
14
+ strategy.segmentize(content)
15
+ end
16
+
17
+ private
18
+
19
+ def tmx
20
+ @options[:tmx] or raise 'tmx option is not set'
21
+ end
22
+
23
+ def lang
24
+ @options[:lang] or raise 'lang option is not set'
25
+ end
26
+
27
+ def format
28
+ if STRATEGIES.has_key?(options[:format])
29
+ options[:format]
30
+ else
31
+ :text
32
+ end
33
+ end
34
+
35
+ def strategy
36
+ STRATEGIES[format].new(tmx, lang)
37
+ end
38
+
39
+ end
40
+ end
@@ -1,7 +1,5 @@
1
1
  require 'mem'
2
- require 'konjak/translator/gtt_html_translate'
3
- require 'konjak/translator/text_translate'
4
- require 'konjak/translator/translated_string'
2
+ require 'konjak/tmx_segmentor'
5
3
 
6
4
  module Konjak
7
5
  class Translator
@@ -17,49 +15,24 @@ module Konjak
17
15
  @options = options
18
16
  end
19
17
 
20
- def translate(doc)
21
- translated_docs = [doc.dup]
22
- translation_units.each do |tu|
23
- translated_docs.map! { |text|
24
- next text if text.is_a?(TranslatedString)
25
-
26
- env = translate_env.dup
27
- env.local_variable_set(:tu, tu)
28
- env.local_variable_set(:src_lang, src_lang)
29
- env.local_variable_set(:target_lang, target_lang)
30
- env.local_variable_set(:text, text)
31
- eval('tu.translate(src_lang, target_lang, text)', env)
32
- }.flatten!
18
+ def translate(content)
19
+ segmentor(content).segments.map do |text|
20
+ next text unless text.is_a?(TmxSegmentor::SegmentString)
21
+ source_segment = text.segment
22
+ target_segment = source_segment.translation_unit.variant(target_lang).segment
23
+ target_segment.interpolate_gtt_tags(source_segment.extract_gtt_tags_from(text))
33
24
  end
34
- translated_docs
35
25
  end
36
26
 
37
27
  private
38
28
 
39
- TRANSLATE_ENVS= {
40
- text: Class.new { using TextTranslate; break binding },
41
- gtt_html: Class.new { using GttHtmlTranslate; break binding }
42
- }
43
-
44
- def format
45
- if TRANSLATE_ENVS.has_key?(options[:format])
46
- options[:format]
47
- else
48
- :text
49
- end
50
- end
51
-
52
- def translate_env
53
- TRANSLATE_ENVS[format]
54
- end
55
-
56
- def translation_units
57
- tmx.body.translation_units.select { |tu|
58
- tu.has_translation?(src_lang, target_lang)
59
- }.sort_by {|tu|
60
- -tu.variant(src_lang).segment.text.length
61
- }
29
+ def segmentor(content)
30
+ TmxSegmentor.new(
31
+ content,
32
+ tmx: tmx,
33
+ lang: src_lang,
34
+ format: options[:format]
35
+ )
62
36
  end
63
- memoize :translation_units
64
37
  end
65
38
  end
@@ -1,3 +1,3 @@
1
1
  module Konjak
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.5"
3
3
  end
data/lib/konjak.rb CHANGED
@@ -34,14 +34,20 @@ require 'konjak/unknown_tag'
34
34
  # translator
35
35
  require 'konjak/translator'
36
36
 
37
+ # segmentor
38
+ require 'konjak/segmentor'
39
+ require 'konjak/html_segmentor'
40
+ require 'konjak/polytex_segmentor'
41
+ require 'konjak/tmx_segmentor'
42
+
37
43
  module Konjak
38
44
  class << self
39
45
  def parse(xml, **options)
40
- Parser.new.parse(xml, **options)
46
+ Parser.new(**options).parse(xml)
41
47
  end
42
48
 
43
49
  def translate(doc, xml_or_tmx, src_lang, target_lang, **options)
44
- tmx = xml_or_tmx.kind_of?(Tmx) ? xml_or_tmx : parse(xml_or_tmx)
50
+ tmx = xml_or_tmx.kind_of?(Tmx) ? xml_or_tmx : parse(xml_or_tmx, **options)
45
51
  Translator.new(tmx, src_lang, target_lang, **options).translate(doc)
46
52
  end
47
53
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: konjak
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Seiei Higa
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-06-05 00:00:00.000000000 Z
11
+ date: 2015-06-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mem
@@ -134,7 +134,7 @@ files:
134
134
  - ".rspec"
135
135
  - ".travis.yml"
136
136
  - Gemfile
137
- - LICENSE.txt
137
+ - LICENSE.md
138
138
  - README.md
139
139
  - Rakefile
140
140
  - bin/konjak
@@ -147,24 +147,30 @@ files:
147
147
  - lib/konjak/end_paired_tag.rb
148
148
  - lib/konjak/header.rb
149
149
  - lib/konjak/highlight.rb
150
+ - lib/konjak/html_segmentor.rb
150
151
  - lib/konjak/inline_element.rb
151
152
  - lib/konjak/isolated_tag.rb
152
153
  - lib/konjak/map.rb
153
154
  - lib/konjak/note.rb
154
155
  - lib/konjak/parser.rb
155
156
  - lib/konjak/placeholder.rb
157
+ - lib/konjak/polytex_segmentor.rb
156
158
  - lib/konjak/property.rb
157
159
  - lib/konjak/segment.rb
160
+ - lib/konjak/segment/gtt.rb
161
+ - lib/konjak/segmentor.rb
158
162
  - lib/konjak/structural_element.rb
159
163
  - lib/konjak/sub_flow.rb
160
164
  - lib/konjak/text.rb
161
165
  - lib/konjak/tmx.rb
166
+ - lib/konjak/tmx_segmentor.rb
167
+ - lib/konjak/tmx_segmentor/gtt_html_strategy.rb
168
+ - lib/konjak/tmx_segmentor/segment_string.rb
169
+ - lib/konjak/tmx_segmentor/strategy.rb
170
+ - lib/konjak/tmx_segmentor/text_strategy.rb
162
171
  - lib/konjak/translation_unit.rb
163
172
  - lib/konjak/translation_unit_variant.rb
164
173
  - lib/konjak/translator.rb
165
- - lib/konjak/translator/gtt_html_translate.rb
166
- - lib/konjak/translator/text_translate.rb
167
- - lib/konjak/translator/translated_string.rb
168
174
  - lib/konjak/unknown_tag.rb
169
175
  - lib/konjak/user_defined_encoding.rb
170
176
  - lib/konjak/version.rb
@@ -1,47 +0,0 @@
1
- module Konjak
2
- class Translator
3
- module GttHtmlTranslate
4
- refine(Text) do
5
- def gtt_tag_ns
6
- scan(/\{(\d+)\}/).flatten.uniq
7
- end
8
-
9
- def compile_gtt_html_pattern
10
- regexp = Regexp.escape(self)
11
- gtt_tag_ns.each do |n|
12
- regexp = regexp.sub(/\\\{#{n}\\\}/) { "(?<n#{n}><(?<_#{n}>\\w+)[^>]*>)" }
13
- regexp = regexp.gsub(/\\\{#{n}\\\}/) { "\\k<n#{n}>" }
14
- regexp = regexp.gsub(/\\\{\/#{n}\\\}/) { "</\\k<_#{n}>>" }
15
- end
16
- Regexp.compile(regexp)
17
- end
18
-
19
- def interpolate_gtt_html_pattern(match_data)
20
- new_text = dup
21
- gtt_tag_ns.each do |n|
22
- new_text = new_text.gsub("{#{n}}", match_data["n#{n}"])
23
- new_text = new_text.gsub("{/#{n}}", "</#{match_data["_#{n}"]}>")
24
- end
25
- new_text
26
- end
27
- end
28
-
29
- refine(TranslationUnit) do
30
- def translate(src_lang, target_lang, text)
31
- pattern = variant(src_lang).segment.text.compile_gtt_html_pattern
32
- target_text = variant(target_lang).segment.text
33
-
34
- texts = []
35
- while true
36
- head, match, tail = text.partition(pattern)
37
- break if match.empty?
38
- texts << head unless head.empty?
39
- texts << TranslatedString.new(target_text.interpolate_gtt_html_pattern($~))
40
- text = tail
41
- end
42
- texts << text
43
- end
44
- end
45
- end
46
- end
47
- end
@@ -1,22 +0,0 @@
1
- module Konjak
2
- class Translator
3
- module TextTranslate
4
- refine(TranslationUnit) do
5
- def translate(src_lang, target_lang, text)
6
- s = variant(src_lang).segment.text
7
- t = variant(target_lang).segment.text
8
-
9
- texts = []
10
- while true
11
- head, match, tail = text.partition(s)
12
- break if match.empty?
13
- texts << head unless head.empty?
14
- texts << TranslatedString.new(t)
15
- text = tail
16
- end
17
- texts << text
18
- end
19
- end
20
- end
21
- end
22
- end
@@ -1,6 +0,0 @@
1
- module Konjak
2
- class Translator
3
- class TranslatedString < String
4
- end
5
- end
6
- end