konjak 0.0.12 → 0.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/lib/konjak/segment/gtt.rb +12 -4
 - data/lib/konjak/segment.rb +7 -1
 - data/lib/konjak/tmx_segmentor/gtt_html_strategy.rb +2 -15
 - data/lib/konjak/tmx_segmentor/strategy.rb +21 -1
 - data/lib/konjak/tmx_segmentor/text_strategy.rb +2 -15
 - data/lib/konjak/version.rb +1 -1
 - data/spec/konjak_translate_spec.rb +5 -3
 - metadata +1 -1
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA1:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: cd181dadc5d9128c27be3312f50274acffbcfce8
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: 29fa12fb01c4409b7587496413b0fda0c46ab0a4
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 188010cd203698454c7f2204c20e6a6e19a18a7eb25640d0c9555e011568ff87fe96ba521ab04c1c42975c59636e634992144a825980f9398abb12e72f913996
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: ba9cf03f27fe8599815060771ea437fb852b95e0a5f3f1e06524e9ef0c5930b0a373c9fca08cbffd83c44b5238acd51403058775a83fe328a3550c7424cd88ee
         
     | 
    
        data/lib/konjak/segment/gtt.rb
    CHANGED
    
    | 
         @@ -1,18 +1,26 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'mem'
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
       1 
3 
     | 
    
         
             
            module Konjak
         
     | 
| 
       2 
4 
     | 
    
         
             
              class Segment < StructuralElement
         
     | 
| 
       3 
5 
     | 
    
         
             
                module GTT
         
     | 
| 
      
 6 
     | 
    
         
            +
                  include Mem
         
     | 
| 
      
 7 
     | 
    
         
            +
             
     | 
| 
       4 
8 
     | 
    
         
             
                  Tag = Struct.new(:gtt, :html)
         
     | 
| 
       5 
9 
     | 
    
         | 
| 
       6 
10 
     | 
    
         
             
                  def compile_gtt_html_pattern
         
     | 
| 
       7 
11 
     | 
    
         
             
                    regexp = Regexp.escape(text)
         
     | 
| 
       8 
12 
     | 
    
         
             
                    gtt_tag_ns.each do |n|
         
     | 
| 
       9 
     | 
    
         
            -
                      regexp 
     | 
| 
       10 
     | 
    
         
            -
                      regexp 
     | 
| 
       11 
     | 
    
         
            -
                      regexp 
     | 
| 
      
 13 
     | 
    
         
            +
                      regexp.sub!(/\\\{#{n}\\\}/)    { "(?<n#{n}><(?<_#{n}>\\w+)[^>]*>)" }
         
     | 
| 
      
 14 
     | 
    
         
            +
                      regexp.gsub!(/\\\{#{n}\\\}/)   { "\\k<n#{n}>" }
         
     | 
| 
      
 15 
     | 
    
         
            +
                      regexp.gsub!(/\\\{\/#{n}\\\}/) { "</\\k<_#{n}>>" }
         
     | 
| 
       12 
16 
     | 
    
         
             
                    end
         
     | 
| 
       13 
     | 
    
         
            -
                    regexp 
     | 
| 
      
 17 
     | 
    
         
            +
                    regexp.gsub!(/(?:\\\s|\n)/m)        { '\s' }
         
     | 
| 
      
 18 
     | 
    
         
            +
                    regexp.gsub!(/(?:\\s)+/m)           {|s| s + '++' }
         
     | 
| 
      
 19 
     | 
    
         
            +
                    regexp.gsub!(/^(?<s>(?:\\s)+)\+\+/) { $~[:s] }
         
     | 
| 
      
 20 
     | 
    
         
            +
                    regexp.gsub!(/(?<s>(?:\\s)+)\+\+$/) { $~[:s] }
         
     | 
| 
       14 
21 
     | 
    
         
             
                    Regexp.compile(regexp)
         
     | 
| 
       15 
22 
     | 
    
         
             
                  end
         
     | 
| 
      
 23 
     | 
    
         
            +
                  memoize :compile_gtt_html_pattern
         
     | 
| 
       16 
24 
     | 
    
         | 
| 
       17 
25 
     | 
    
         
             
                  def extract_gtt_tags_from(text)
         
     | 
| 
       18 
26 
     | 
    
         
             
                    m = text.match(compile_gtt_html_pattern)
         
     | 
    
        data/lib/konjak/segment.rb
    CHANGED
    
    | 
         @@ -1,9 +1,11 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            require 'konjak/segment/gtt'
         
     | 
| 
      
 2 
     | 
    
         
            +
            require 'mem'
         
     | 
| 
       2 
3 
     | 
    
         | 
| 
       3 
4 
     | 
    
         
             
            module Konjak
         
     | 
| 
       4 
5 
     | 
    
         
             
              # container
         
     | 
| 
       5 
6 
     | 
    
         
             
              class Segment < StructuralElement
         
     | 
| 
       6 
7 
     | 
    
         
             
                include GTT
         
     | 
| 
      
 8 
     | 
    
         
            +
                include Mem
         
     | 
| 
       7 
9 
     | 
    
         | 
| 
       8 
10 
     | 
    
         
             
                # children
         
     | 
| 
       9 
11 
     | 
    
         
             
                def text
         
     | 
| 
         @@ -17,9 +19,13 @@ module Konjak 
     | 
|
| 
       17 
19 
     | 
    
         | 
| 
       18 
20 
     | 
    
         
             
                def compile_pattern
         
     | 
| 
       19 
21 
     | 
    
         
             
                  regexp = Regexp.escape(text)
         
     | 
| 
       20 
     | 
    
         
            -
                  regexp = regexp.gsub(/(?:\\\s|\n)/m) 
     | 
| 
      
 22 
     | 
    
         
            +
                  regexp = regexp.gsub(/(?:\\\s|\n)/m)        { '\s' }
         
     | 
| 
      
 23 
     | 
    
         
            +
                  regexp = regexp.gsub(/(?:\\s)+/m)           {|s| s + '++' }
         
     | 
| 
      
 24 
     | 
    
         
            +
                  regexp = regexp.gsub(/^(?<s>(?:\\s)+)\+\+/) { $~[:s] }
         
     | 
| 
      
 25 
     | 
    
         
            +
                  regexp = regexp.gsub(/(?<s>(?:\\s)+)\+\+$/) { $~[:s] }
         
     | 
| 
       21 
26 
     | 
    
         
             
                  Regexp.compile(regexp)
         
     | 
| 
       22 
27 
     | 
    
         
             
                end
         
     | 
| 
      
 28 
     | 
    
         
            +
                memoize :compile_pattern
         
     | 
| 
       23 
29 
     | 
    
         | 
| 
       24 
30 
     | 
    
         
             
                def translation_unit
         
     | 
| 
       25 
31 
     | 
    
         
             
                  TranslationUnit.new(translation_unit_variant.parent)
         
     | 
    
        data/lib/konjak/tmx_segmentor/gtt_html_strategy.rb
    CHANGED
    
    | 
         @@ -7,21 +7,8 @@ module Konjak 
     | 
|
| 
       7 
7 
     | 
    
         | 
| 
       8 
8 
     | 
    
         
             
                  private
         
     | 
| 
       9 
9 
     | 
    
         | 
| 
       10 
     | 
    
         
            -
                  def  
     | 
| 
       11 
     | 
    
         
            -
                    segment 
     | 
| 
       12 
     | 
    
         
            -
                    pattern = segment.compile_gtt_html_pattern
         
     | 
| 
       13 
     | 
    
         
            -
             
     | 
| 
       14 
     | 
    
         
            -
                    texts = []
         
     | 
| 
       15 
     | 
    
         
            -
                    while true
         
     | 
| 
       16 
     | 
    
         
            -
                      head, match, tail = text.partition(pattern)
         
     | 
| 
       17 
     | 
    
         
            -
                      break if match.empty? || text.length < min_segment_length
         
     | 
| 
       18 
     | 
    
         
            -
                      texts << head unless head.empty?
         
     | 
| 
       19 
     | 
    
         
            -
             
     | 
| 
       20 
     | 
    
         
            -
                      texts << SegmentString.new(match, segment)
         
     | 
| 
       21 
     | 
    
         
            -
             
     | 
| 
       22 
     | 
    
         
            -
                      text = tail
         
     | 
| 
       23 
     | 
    
         
            -
                    end
         
     | 
| 
       24 
     | 
    
         
            -
                    texts << text
         
     | 
| 
      
 10 
     | 
    
         
            +
                  def compile_pattern(segment)
         
     | 
| 
      
 11 
     | 
    
         
            +
                    segment.compile_gtt_html_pattern
         
     | 
| 
       25 
12 
     | 
    
         
             
                  end
         
     | 
| 
       26 
13 
     | 
    
         
             
                end
         
     | 
| 
       27 
14 
     | 
    
         
             
              end
         
     | 
    
        data/lib/konjak/tmx_segmentor/strategy.rb
    CHANGED
    
    | 
         @@ -16,10 +16,13 @@ module Konjak 
     | 
|
| 
       16 
16 
     | 
    
         
             
                  def segmentize(text)
         
     | 
| 
       17 
17 
     | 
    
         
             
                    segments = [text]
         
     | 
| 
       18 
18 
     | 
    
         
             
                    translation_units.each do |translation_unit|
         
     | 
| 
      
 19 
     | 
    
         
            +
                      segment = translation_unit.variant(@lang).segment
         
     | 
| 
      
 20 
     | 
    
         
            +
             
     | 
| 
       19 
21 
     | 
    
         
             
                      segments.map! {|text|
         
     | 
| 
      
 22 
     | 
    
         
            +
                        next text if text.length < min_segment_length
         
     | 
| 
       20 
23 
     | 
    
         
             
                        next text if text.is_a?(SegmentString)
         
     | 
| 
       21 
24 
     | 
    
         | 
| 
       22 
     | 
    
         
            -
                        split( 
     | 
| 
      
 25 
     | 
    
         
            +
                        split(segment, text)
         
     | 
| 
       23 
26 
     | 
    
         
             
                      }.flatten!
         
     | 
| 
       24 
27 
     | 
    
         
             
                    end
         
     | 
| 
       25 
28 
     | 
    
         
             
                    segments
         
     | 
| 
         @@ -35,6 +38,23 @@ module Konjak 
     | 
|
| 
       35 
38 
     | 
    
         
             
                    @options[:min_segment_length]
         
     | 
| 
       36 
39 
     | 
    
         
             
                  end
         
     | 
| 
       37 
40 
     | 
    
         | 
| 
      
 41 
     | 
    
         
            +
                  def split(segment, text)
         
     | 
| 
      
 42 
     | 
    
         
            +
                    texts = []
         
     | 
| 
      
 43 
     | 
    
         
            +
                    while true
         
     | 
| 
      
 44 
     | 
    
         
            +
                      break if text.length < min_segment_length
         
     | 
| 
      
 45 
     | 
    
         
            +
             
     | 
| 
      
 46 
     | 
    
         
            +
                      head, match, tail = text.partition(compile_pattern(segment))
         
     | 
| 
      
 47 
     | 
    
         
            +
                      break if match.empty?
         
     | 
| 
      
 48 
     | 
    
         
            +
             
     | 
| 
      
 49 
     | 
    
         
            +
                      texts << head unless head.empty?
         
     | 
| 
      
 50 
     | 
    
         
            +
             
     | 
| 
      
 51 
     | 
    
         
            +
                      texts << SegmentString.new(match, segment)
         
     | 
| 
      
 52 
     | 
    
         
            +
             
     | 
| 
      
 53 
     | 
    
         
            +
                      text = tail
         
     | 
| 
      
 54 
     | 
    
         
            +
                    end
         
     | 
| 
      
 55 
     | 
    
         
            +
                    texts << text
         
     | 
| 
      
 56 
     | 
    
         
            +
                  end
         
     | 
| 
      
 57 
     | 
    
         
            +
             
     | 
| 
       38 
58 
     | 
    
         
             
                  def translation_units
         
     | 
| 
       39 
59 
     | 
    
         
             
                    @tmx.body.translation_units.sort_by {|tu|
         
     | 
| 
       40 
60 
     | 
    
         
             
                      -tu.variant(@lang).segment.text.length
         
     | 
    
        data/lib/konjak/tmx_segmentor/text_strategy.rb
    CHANGED
    
    | 
         @@ -7,21 +7,8 @@ module Konjak 
     | 
|
| 
       7 
7 
     | 
    
         | 
| 
       8 
8 
     | 
    
         
             
                  private
         
     | 
| 
       9 
9 
     | 
    
         | 
| 
       10 
     | 
    
         
            -
                  def  
     | 
| 
       11 
     | 
    
         
            -
                    segment 
     | 
| 
       12 
     | 
    
         
            -
                    pattern = segment.compile_pattern
         
     | 
| 
       13 
     | 
    
         
            -
             
     | 
| 
       14 
     | 
    
         
            -
                    texts = []
         
     | 
| 
       15 
     | 
    
         
            -
                    while true
         
     | 
| 
       16 
     | 
    
         
            -
                      head, match, tail = text.partition(pattern)
         
     | 
| 
       17 
     | 
    
         
            -
                      break if match.empty? || text.length < min_segment_length
         
     | 
| 
       18 
     | 
    
         
            -
                      texts << head unless head.empty?
         
     | 
| 
       19 
     | 
    
         
            -
             
     | 
| 
       20 
     | 
    
         
            -
                      texts << SegmentString.new(match, segment)
         
     | 
| 
       21 
     | 
    
         
            -
             
     | 
| 
       22 
     | 
    
         
            -
                      text = tail
         
     | 
| 
       23 
     | 
    
         
            -
                    end
         
     | 
| 
       24 
     | 
    
         
            -
                    texts << text
         
     | 
| 
      
 10 
     | 
    
         
            +
                  def compile_pattern(segment)
         
     | 
| 
      
 11 
     | 
    
         
            +
                    segment.compile_pattern
         
     | 
| 
       25 
12 
     | 
    
         
             
                  end
         
     | 
| 
       26 
13 
     | 
    
         
             
                end
         
     | 
| 
       27 
14 
     | 
    
         
             
              end
         
     | 
    
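        data/lib/konjak/version.rb
    CHANGED
    
    (hunk not captured in this extract; the file list above reports +1 -1)
    
        data/spec/konjak_translate_spec.rb
    CHANGED
    
    | 
         @@ -19,8 +19,10 @@ EXPECT 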
     | 
|
| 
       19 
19 
     | 
    
         | 
| 
       20 
20 
     | 
    
         
             
              context 'when blanks between words is not exactly match' do
         
     | 
| 
       21 
21 
     | 
    
         
             
                let(:doc) { <<DOC }
         
     | 
| 
       22 
     | 
    
         
            -
            this is data
         
     | 
| 
       23 
     | 
    
         
            -
             
     | 
| 
      
 22 
     | 
    
         
            +
            this is data         (with a non-standard
         
     | 
| 
      
 23 
     | 
    
         
            +
             
     | 
| 
      
 24 
     | 
    
         
            +
             
     | 
| 
      
 25 
     | 
    
         
            +
            character: ).
         
     | 
| 
       24 
26 
     | 
    
         
             
            this is data (with a non-standard character: ).
         
     | 
| 
       25 
27 
     | 
    
         
             
            DOC
         
     | 
| 
       26 
28 
     | 
    
         | 
| 
         @@ -54,7 +56,7 @@ EXPECT 
     | 
|
| 
       54 
56 
     | 
    
         
             
                context 'when blanks between words is not exactly match' do
         
     | 
| 
       55 
57 
     | 
    
         
             
                  let(:doc) { <<GTT_HTML }
         
     | 
| 
       56 
58 
     | 
    
         
             
            This is
         
     | 
| 
       57 
     | 
    
         
            -
            <a href="http://example.com">example</a>.
         
     | 
| 
      
 59 
     | 
    
         
            +
                     <a href="http://example.com">example</a>.
         
     | 
| 
       58 
60 
     | 
    
         
             
            And This
         
     | 
| 
       59 
61 
     | 
    
         
             
            is
         
     | 
| 
       60 
62 
     | 
    
         
             
            <b>example</b>. Yey.
         
     |