konjak 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ea6b25de9fe1bb666b02b8db7a3253619fe6cc5b
4
- data.tar.gz: 4bf4b8cdc708b5904c16e8d3b656c37b15c26f4e
3
+ metadata.gz: 0150635f9f3db7b2d052aee30b1db085162eceaa
4
+ data.tar.gz: f2a5b91c9dc7a9a99d84c9ac979a67893959ab56
5
5
  SHA512:
6
- metadata.gz: 94607b5df832840753b9a863dec409c98a283d9be758f40be2af1522cdd0815fe7615390f868fd2afa89133046a9418de25b1c568146cc5f3ceb491854c423c6
7
- data.tar.gz: bef5ef91e252bfadde9f6b8f6cb8fd6674efd4ae4e2d4c28a4f65e88c9a0330e365faf7fe4ced678a56a876f185dd8dac0d87f3877ef4d13f7d12be4939a94f1
6
+ metadata.gz: add182fac41dada9dce73a25f1a7b90ca1663fb4e52c0ae836e190cbd0bc3795356917b7f88918af1a61c8d2f7dc73ca09c6a268398f5c6f3a78d1c62a18d365
7
+ data.tar.gz: 21a97c4060b425a7ebfc9b3df5b6edd4de26d9c2ec874df619155257067f844b70a7efc71707cb1bb89a53b9a64015ff0aee739256b4df9acbf1a5062f85d873
@@ -1,6 +1,6 @@
1
- Copyright (c) 2014 Seiei Higa
1
+ Copyright © 2014-2015 [YassLab](http://yasslab.jp)
2
2
 
3
- MIT License
3
+ The MIT License
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining
6
6
  a copy of this software and associated documentation files (the
data/README.md CHANGED
@@ -31,3 +31,9 @@ Or install it yourself as:
31
31
  3. Commit your changes (`git commit -am 'Add some feature'`)
32
32
  4. Push to the branch (`git push origin my-new-feature`)
33
33
  5. Create new Pull Request
34
+
35
+ ## License
36
+
37
+ Copyright © 2015 [YassLab](http://yasslab.jp)
38
+
39
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
@@ -0,0 +1,27 @@
1
module Konjak
  # Splits HTML content into segments at block-level elements.
  #
  # Every <p>, <h1>..<h6>, <li>, <title>, <td> and <div> element (with or
  # without attributes) is cut out as its own segment, tags included; the
  # text before and after each element is kept as separate segments.
  class HtmlSegmentor < Segmentor
    # @return [Array<String>] the non-empty pieces of +content+, in document order
    def segments
      pieces = [content.dup]

      element_patterns = [
        %r{<(?<start>p|h1|h2|h3|h4|h5|h6|li|title|td)>(.*?)</\k<start>>}m,
        %r{<(?<start>p|h1|h2|h3|h4|h5|h6|li|title|td) [^>]*?>(.*?)</\k<start>>}m,
        %r{<div>(.*?)</div>}m,
        %r{<div [^>]*?>(.*?)</div>}m
      ]

      # String#partition only splits at the first match, so keep sweeping over
      # all patterns until a full pass no longer produces additional pieces.
      loop do
        count_before_pass = pieces.size

        element_patterns.each do |pattern|
          pieces = pieces
                   .flat_map { |piece| piece.partition(pattern) }
                   .reject(&:empty?)
        end

        break if pieces.size == count_before_pass
      end

      pieces
    end
  end
end
data/lib/konjak/parser.rb CHANGED
@@ -2,7 +2,13 @@ require 'nokogiri'
2
2
 
3
3
  module Konjak
4
4
  class Parser
5
- def parse(xml, gtt: false)
5
+ attr_accessor :gtt
6
+
7
+ def initialize(gtt: false)
8
+ @gtt = gtt
9
+ end
10
+
11
+ def parse(xml)
6
12
  if gtt
7
13
  # FIXME
8
14
  xml = xml.gsub(/&amp;(#\d+|#x[0-9a-fA-F]+|[0-9a-zA-Z]+);/) { "&#{$1};" }
@@ -0,0 +1,40 @@
1
module Konjak
  # Splits PolyTeX / LaTeX-like content into segments.
  #
  # The patterns are applied in the order listed; each one cuts its first match
  # out of every current piece, and the whole list is re-applied until a full
  # pass no longer increases the number of pieces.
  class PolytexSegmentor < Segmentor
    # @return [Array<String>] the non-empty pieces of +content+, in document order
    def segments
      pieces = [content.dup]

      # NOTE(review): in the first pattern `[\n.]` is a character class, so it
      # matches only newlines and literal dots, not "any character" — confirm
      # whether `.*?` was intended before changing it.
      polytex_patterns = [
        /\\begin\{(?<start>[^\}]+)\}([\n.]*?)\\end\{\k<start>\}/m,
        /(?<=\\chapter\{)[^\}]+(?=\})/,
        /(?<=\\section\{)[^\}]+(?=\})/,
        /(?<=\\subsection\{)[^\}]+(?=\})/,
        /\\footnote\{(?<gr>\\(?!footnote)[^\{]+\{[^\}]+\}(?:\{[^\}]+\})?\g<gr>|[^{])+\}/m,
        /(?<=\\footnote\{)(?<gr>\\(?!footnote)[^\{]+\{[^\}]+\}(?:\{[^\}]+\})?\g<gr>|[^{])+(?=\})/m,
        /(?<=\\codecaption\{).+(?= \\|\}$)/,
        /(?<=\\caption\{).+(?=\\label\{.*\}\}$)/,
        /(?<=\n)^.*$(?=\n)/m,
        /# .*$/,
        /(?<=^).+?[\.\?\!](?= |\n|\t)/,
        /(?<=\()[^\.\n]+[\.\?\!](?=\))/,
        /^ (?=[\w\\]+)/,
        /^\s+% .*$/,
        /^$/,
        /\\noindent /,
        /\\item /,
      ]

      # String#partition only splits at the first match, so repeat the sweep
      # until the piece count is stable.
      loop do
        count_before_pass = pieces.size

        polytex_patterns.each do |pattern|
          pieces = pieces
                   .flat_map { |piece| piece.partition(pattern) }
                   .reject(&:empty?)
        end

        break if pieces.size == count_before_pass
      end

      pieces
    end
  end
end
@@ -0,0 +1,39 @@
1
module Konjak
  class Segment < StructuralElement
    # Helpers for segments whose text contains numbered placeholders of the
    # form `{n}` ... `{/n}` standing in for HTML opening and closing tags.
    #
    # The including class must respond to +text+.
    module GTT
      # Pairs a placeholder (+gtt+, e.g. "{1}" or "{/1}") with the concrete
      # HTML tag (+html+) it stands for.
      Tag = Struct.new(:gtt, :html)

      # Builds a Regexp that matches this segment's text with every `{n}`
      # replaced by "any opening HTML tag" and every `{/n}` by the matching
      # closing tag.
      #
      # The first `{n}` becomes the named group `n<n>` (whole opening tag) with
      # the nested group `_<n>` (tag name); later occurrences of `{n}` must
      # repeat the same opening tag via a back-reference.
      #
      # @return [Regexp]
      def compile_gtt_html_pattern
        regexp = Regexp.escape(text)
        gtt_tag_ns.each do |n|
          # Regexp.escape turned "{n}" into "\{n\}", hence the escaped patterns.
          regexp = regexp.sub(/\\\{#{n}\\\}/) { "(?<n#{n}><(?<_#{n}>\\w+)[^>]*>)" }
          regexp = regexp.gsub(/\\\{#{n}\\\}/) { "\\k<n#{n}>" }
          regexp = regexp.gsub(/\\\{\/#{n}\\\}/) { "</\\k<_#{n}>>" }
        end
        Regexp.compile(regexp)
      end

      # Matches +text+ against this segment's pattern and returns the HTML tag
      # each placeholder corresponds to.
      #
      # @param text [String] text to match against this segment's pattern
      # @return [Array<Tag>] an opening and a closing Tag per placeholder
      #   number; empty when +text+ does not match the pattern
      def extract_gtt_tags_from(text)
        m = text.match(compile_gtt_html_pattern)
        # A non-matching text (for example one located by literal comparison
        # that still contains "{1}") has no tags to extract; returning early
        # avoids calling #[] on nil.
        return [] unless m

        gtt_tag_ns.each_with_object([]) do |n, tags|
          tags << Tag.new("{#{n}}", m["n#{n}"])
          tags << Tag.new("{/#{n}}", "</#{m["_#{n}"]}>")
        end
      end

      # Returns a copy of this segment's text with each placeholder replaced by
      # the HTML of the corresponding tag.
      #
      # @param tags [Array<Tag>] placeholder/HTML pairs to substitute
      # @return [String]
      def interpolate_gtt_tags(tags)
        new_text = self.text.dup
        tags.each do |tag|
          # Block form: the HTML is inserted verbatim. With a replacement
          # string, backslash sequences such as "\1" or "\\" inside the HTML
          # would be interpreted as back-references.
          new_text = new_text.gsub(tag[:gtt]) { tag[:html] }
        end
        new_text
      end

      private

      # @return [Array<String>] the distinct placeholder numbers that occur as
      #   `{n}` in the text, in order of first appearance
      def gtt_tag_ns
        text.scan(/\{(\d+)\}/).flatten.uniq
      end
    end
  end
end
@@ -1,6 +1,10 @@
1
+ require 'konjak/segment/gtt'
2
+
1
3
  module Konjak
2
4
  # container
3
5
  class Segment < StructuralElement
6
+ include GTT
7
+
4
8
  # children
5
9
  def text
6
10
  Text.new(super)
@@ -10,5 +14,13 @@ module Konjak
10
14
  def can_contain?(element)
11
15
  [Text, BeginPairedTag, EndPairedTag, IsolatedTag, Placeholder, Highlight].any? {|c| c === element }
12
16
  end
17
+
18
+ def translation_unit
19
+ TranslationUnit.new(translation_unit_variant.parent)
20
+ end
21
+
22
+ def translation_unit_variant
23
+ TranslationUnitVariant.new(parent)
24
+ end
13
25
  end
14
26
  end
@@ -0,0 +1,10 @@
1
module Konjak
  # Base class for segmentors: holds the content to be split and the options
  # that steer the splitting. Subclasses are expected to provide +segments+.
  class Segmentor
    attr_accessor :content, :options

    # @param content [String] the text to be segmented
    # @param options [Hash] segmentor-specific settings; defaults to an empty
    #   hash so that segmentors which read no options can be built from the
    #   content alone
    def initialize(content, options = {})
      @content = content
      @options = options
    end
  end
end
@@ -0,0 +1,28 @@
1
+ require 'konjak/tmx_segmentor/segment_string'
2
+ require 'konjak/tmx_segmentor/strategy'
3
+
4
module Konjak
  class TmxSegmentor < Segmentor
    # Strategy that locates a translation unit's segment inside HTML text by
    # means of the segment's compiled placeholder-to-tag pattern.
    class GttHtmlStrategy < Strategy

      private

      # Cuts +text+ at every occurrence of the segment that +translation_unit+
      # holds for @lang.
      #
      # @return [Array<String>] unmatched stretches as plain strings and
      #   matches as SegmentString, in order; the remainder after the last
      #   match is always appended, even when it is empty
      def split(translation_unit, text)
        segment = translation_unit.variant(@lang).segment
        pattern = segment.compile_gtt_html_pattern

        pieces = []
        remainder = text
        loop do
          before, hit, after = remainder.partition(pattern)
          # An empty hit means there is no (further) occurrence.
          break if hit.empty?

          pieces << before unless before.empty?
          pieces << SegmentString.new(hit, segment)
          remainder = after
        end
        pieces << remainder
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
+ require 'konjak/segmentor'
2
+
3
module Konjak
  class TmxSegmentor < Segmentor
    # A String that also carries the segment object it was matched against.
    class SegmentString < String
      attr_accessor :segment

      # @param str [String] the matched text
      # @param segment the segment whose text or pattern produced the match
      def initialize(str, segment)
        @segment = segment
        super(str)
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
+ require 'mem'
2
+ require 'konjak/segmentor'
3
+ require 'konjak/tmx_segmentor/segment_string'
4
+
5
module Konjak
  class TmxSegmentor < Segmentor
    # Common driver for the segmentation strategies. Subclasses supply a
    # private +split(translation_unit, text)+ that returns the pieces of
    # +text+, with matches wrapped in SegmentString.
    class Strategy
      include Mem

      # @param tmx the TMX document whose translation units are searched for
      # @param lang the language whose variants are matched against the text
      def initialize(tmx, lang)
        @tmx = tmx
        @lang = lang
      end

      # Splits +text+ on every translation unit in turn. Pieces that already
      # are a SegmentString are left untouched by later units.
      #
      # @param text [String]
      # @return [Array<String>] plain strings and SegmentString instances
      def segmentize(text)
        translation_units.each_with_object([text]) do |unit, pieces|
          pieces.map! do |piece|
            piece.is_a?(SegmentString) ? piece : split(unit, piece)
          end
          pieces.flatten!
        end
      end

      private

      # Translation units ordered by descending length of their @lang segment
      # text, so longer segments are tried first.
      #
      # NOTE(review): every unit is assumed to have a variant for @lang; units
      # lacking one are not filtered out here — confirm against the callers.
      def translation_units
        units = @tmx.body.translation_units
        units.sort_by { |unit| -unit.variant(@lang).segment.text.length }
      end
      memoize :translation_units
    end
  end
end
@@ -0,0 +1,28 @@
1
+ require 'konjak/tmx_segmentor/segment_string'
2
+ require 'konjak/tmx_segmentor/strategy'
3
+
4
module Konjak
  class TmxSegmentor < Segmentor
    # Strategy that locates a translation unit's segment by searching the text
    # for the segment's own text.
    class TextStrategy < Strategy

      private

      # Cuts +text+ at every occurrence of the text of the segment that
      # +translation_unit+ holds for @lang.
      #
      # @return [Array<String>] unmatched stretches as plain strings and
      #   matches as SegmentString, in order; the remainder after the last
      #   match is always appended, even when it is empty
      def split(translation_unit, text)
        segment = translation_unit.variant(@lang).segment
        needle = segment.text

        pieces = []
        remainder = text
        loop do
          before, hit, after = remainder.partition(needle)
          # An empty hit means there is no (further) occurrence.
          break if hit.empty?

          pieces << before unless before.empty?
          pieces << SegmentString.new(hit, segment)
          remainder = after
        end
        pieces << remainder
      end
    end
  end
end
@@ -0,0 +1,40 @@
1
+ require 'konjak/segmentor'
2
+ require 'konjak/tmx_segmentor/gtt_html_strategy'
3
+ require 'konjak/tmx_segmentor/text_strategy'
4
+
5
module Konjak
  # Segmentor that splits content wherever it matches a segment of a TMX
  # document. Reads the options :tmx, :lang and :format.
  class TmxSegmentor < Segmentor

    # Strategy class for each supported value of the :format option.
    STRATEGIES = {
      text:     TextStrategy,
      gtt_html: GttHtmlStrategy,
    }

    # @return [Array<String>] the pieces produced by the selected strategy
    def segments
      strategy.segmentize(content)
    end

    private

    # @raise [RuntimeError] when the :tmx option is missing
    def tmx
      @options[:tmx] || raise('tmx option is not set')
    end

    # @raise [RuntimeError] when the :lang option is missing
    def lang
      @options[:lang] || raise('lang option is not set')
    end

    # The requested :format when it names a known strategy, otherwise :text.
    def format
      requested = options[:format]
      STRATEGIES.key?(requested) ? requested : :text
    end

    # Builds a fresh strategy instance for the current tmx and lang.
    def strategy
      strategy_class = STRATEGIES[format]
      strategy_class.new(tmx, lang)
    end

  end
end
@@ -1,7 +1,5 @@
1
1
  require 'mem'
2
- require 'konjak/translator/gtt_html_translate'
3
- require 'konjak/translator/text_translate'
4
- require 'konjak/translator/translated_string'
2
+ require 'konjak/tmx_segmentor'
5
3
 
6
4
  module Konjak
7
5
  class Translator
@@ -17,49 +15,24 @@ module Konjak
17
15
  @options = options
18
16
  end
19
17
 
20
- def translate(doc)
21
- translated_docs = [doc.dup]
22
- translation_units.each do |tu|
23
- translated_docs.map! { |text|
24
- next text if text.is_a?(TranslatedString)
25
-
26
- env = translate_env.dup
27
- env.local_variable_set(:tu, tu)
28
- env.local_variable_set(:src_lang, src_lang)
29
- env.local_variable_set(:target_lang, target_lang)
30
- env.local_variable_set(:text, text)
31
- eval('tu.translate(src_lang, target_lang, text)', env)
32
- }.flatten!
18
+ def translate(content)
19
+ segmentor(content).segments.map do |text|
20
+ next text unless text.is_a?(TmxSegmentor::SegmentString)
21
+ source_segment = text.segment
22
+ target_segment = source_segment.translation_unit.variant(target_lang).segment
23
+ target_segment.interpolate_gtt_tags(source_segment.extract_gtt_tags_from(text))
33
24
  end
34
- translated_docs
35
25
  end
36
26
 
37
27
  private
38
28
 
39
- TRANSLATE_ENVS= {
40
- text: Class.new { using TextTranslate; break binding },
41
- gtt_html: Class.new { using GttHtmlTranslate; break binding }
42
- }
43
-
44
- def format
45
- if TRANSLATE_ENVS.has_key?(options[:format])
46
- options[:format]
47
- else
48
- :text
49
- end
50
- end
51
-
52
- def translate_env
53
- TRANSLATE_ENVS[format]
54
- end
55
-
56
- def translation_units
57
- tmx.body.translation_units.select { |tu|
58
- tu.has_translation?(src_lang, target_lang)
59
- }.sort_by {|tu|
60
- -tu.variant(src_lang).segment.text.length
61
- }
29
+ def segmentor(content)
30
+ TmxSegmentor.new(
31
+ content,
32
+ tmx: tmx,
33
+ lang: src_lang,
34
+ format: options[:format]
35
+ )
62
36
  end
63
- memoize :translation_units
64
37
  end
65
38
  end
@@ -1,3 +1,3 @@
1
1
  module Konjak
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.5"
3
3
  end
data/lib/konjak.rb CHANGED
@@ -34,14 +34,20 @@ require 'konjak/unknown_tag'
34
34
  # translator
35
35
  require 'konjak/translator'
36
36
 
37
+ # segmentor
38
+ require 'konjak/segmentor'
39
+ require 'konjak/html_segmentor'
40
+ require 'konjak/polytex_segmentor'
41
+ require 'konjak/tmx_segmentor'
42
+
37
43
  module Konjak
38
44
  class << self
39
45
  def parse(xml, **options)
40
- Parser.new.parse(xml, **options)
46
+ Parser.new(**options).parse(xml)
41
47
  end
42
48
 
43
49
  def translate(doc, xml_or_tmx, src_lang, target_lang, **options)
44
- tmx = xml_or_tmx.kind_of?(Tmx) ? xml_or_tmx : parse(xml_or_tmx)
50
+ tmx = xml_or_tmx.kind_of?(Tmx) ? xml_or_tmx : parse(xml_or_tmx, **options)
45
51
  Translator.new(tmx, src_lang, target_lang, **options).translate(doc)
46
52
  end
47
53
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: konjak
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Seiei Higa
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-06-05 00:00:00.000000000 Z
11
+ date: 2015-06-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mem
@@ -134,7 +134,7 @@ files:
134
134
  - ".rspec"
135
135
  - ".travis.yml"
136
136
  - Gemfile
137
- - LICENSE.txt
137
+ - LICENSE.md
138
138
  - README.md
139
139
  - Rakefile
140
140
  - bin/konjak
@@ -147,24 +147,30 @@ files:
147
147
  - lib/konjak/end_paired_tag.rb
148
148
  - lib/konjak/header.rb
149
149
  - lib/konjak/highlight.rb
150
+ - lib/konjak/html_segmentor.rb
150
151
  - lib/konjak/inline_element.rb
151
152
  - lib/konjak/isolated_tag.rb
152
153
  - lib/konjak/map.rb
153
154
  - lib/konjak/note.rb
154
155
  - lib/konjak/parser.rb
155
156
  - lib/konjak/placeholder.rb
157
+ - lib/konjak/polytex_segmentor.rb
156
158
  - lib/konjak/property.rb
157
159
  - lib/konjak/segment.rb
160
+ - lib/konjak/segment/gtt.rb
161
+ - lib/konjak/segmentor.rb
158
162
  - lib/konjak/structural_element.rb
159
163
  - lib/konjak/sub_flow.rb
160
164
  - lib/konjak/text.rb
161
165
  - lib/konjak/tmx.rb
166
+ - lib/konjak/tmx_segmentor.rb
167
+ - lib/konjak/tmx_segmentor/gtt_html_strategy.rb
168
+ - lib/konjak/tmx_segmentor/segment_string.rb
169
+ - lib/konjak/tmx_segmentor/strategy.rb
170
+ - lib/konjak/tmx_segmentor/text_strategy.rb
162
171
  - lib/konjak/translation_unit.rb
163
172
  - lib/konjak/translation_unit_variant.rb
164
173
  - lib/konjak/translator.rb
165
- - lib/konjak/translator/gtt_html_translate.rb
166
- - lib/konjak/translator/text_translate.rb
167
- - lib/konjak/translator/translated_string.rb
168
174
  - lib/konjak/unknown_tag.rb
169
175
  - lib/konjak/user_defined_encoding.rb
170
176
  - lib/konjak/version.rb
@@ -1,47 +0,0 @@
1
- module Konjak
2
- class Translator
3
- module GttHtmlTranslate
4
- refine(Text) do
5
- def gtt_tag_ns
6
- scan(/\{(\d+)\}/).flatten.uniq
7
- end
8
-
9
- def compile_gtt_html_pattern
10
- regexp = Regexp.escape(self)
11
- gtt_tag_ns.each do |n|
12
- regexp = regexp.sub(/\\\{#{n}\\\}/) { "(?<n#{n}><(?<_#{n}>\\w+)[^>]*>)" }
13
- regexp = regexp.gsub(/\\\{#{n}\\\}/) { "\\k<n#{n}>" }
14
- regexp = regexp.gsub(/\\\{\/#{n}\\\}/) { "</\\k<_#{n}>>" }
15
- end
16
- Regexp.compile(regexp)
17
- end
18
-
19
- def interpolate_gtt_html_pattern(match_data)
20
- new_text = dup
21
- gtt_tag_ns.each do |n|
22
- new_text = new_text.gsub("{#{n}}", match_data["n#{n}"])
23
- new_text = new_text.gsub("{/#{n}}", "</#{match_data["_#{n}"]}>")
24
- end
25
- new_text
26
- end
27
- end
28
-
29
- refine(TranslationUnit) do
30
- def translate(src_lang, target_lang, text)
31
- pattern = variant(src_lang).segment.text.compile_gtt_html_pattern
32
- target_text = variant(target_lang).segment.text
33
-
34
- texts = []
35
- while true
36
- head, match, tail = text.partition(pattern)
37
- break if match.empty?
38
- texts << head unless head.empty?
39
- texts << TranslatedString.new(target_text.interpolate_gtt_html_pattern($~))
40
- text = tail
41
- end
42
- texts << text
43
- end
44
- end
45
- end
46
- end
47
- end
@@ -1,22 +0,0 @@
1
- module Konjak
2
- class Translator
3
- module TextTranslate
4
- refine(TranslationUnit) do
5
- def translate(src_lang, target_lang, text)
6
- s = variant(src_lang).segment.text
7
- t = variant(target_lang).segment.text
8
-
9
- texts = []
10
- while true
11
- head, match, tail = text.partition(s)
12
- break if match.empty?
13
- texts << head unless head.empty?
14
- texts << TranslatedString.new(t)
15
- text = tail
16
- end
17
- texts << text
18
- end
19
- end
20
- end
21
- end
22
- end
@@ -1,6 +0,0 @@
1
- module Konjak
2
- class Translator
3
- class TranslatedString < String
4
- end
5
- end
6
- end