greeb 0.2.2.rc1 → 0.2.2.rc2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +52 -52
- data/bin/greeb +2 -2
- data/lib/greeb.rb +2 -39
- data/lib/greeb/core.rb +13 -12
- data/lib/greeb/exceptions.rb +17 -0
- data/lib/greeb/parser.rb +20 -7
- data/lib/greeb/segmentator.rb +38 -40
- data/lib/greeb/span.rb +36 -0
- data/lib/greeb/tokenizer.rb +11 -11
- data/lib/greeb/version.rb +1 -1
- data/spec/core_spec.rb +31 -33
- data/spec/parser_spec.rb +42 -30
- data/spec/segmentator_spec.rb +81 -83
- data/spec/span_spec.rb +63 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/tokenizer_spec.rb +76 -78
- metadata +5 -1
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA1:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 043ee2da87958a027caf792058ae1e3e44cc9684
         | 
| 4 | 
            +
              data.tar.gz: 8f4a99b26f706badb15fd9e9d5533dd162e090e8
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: e950167615138975bc9873a729f2486eb506692fcdaefdd3aa828590d261da0d336e04e481c652036df892da305269a047ab076622c315bd17b3c015990dcba7
         | 
| 7 | 
            +
              data.tar.gz: 7642fa3892694606792db842b0ea22a8ba13800b71a3e72eaff93d41ab0548f90a6881298aa290a61110941de350ef9941871e7497fda26b4d1e316e3688c50b
         | 
    
        data/README.md
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            # Greeb
         | 
| 2 2 | 
             
            Greeb [grʲip] is a simple yet awesome and Unicode-aware text segmentator
         | 
| 3 3 | 
             
            that is based on regular expressions. API documentation is available at
         | 
| 4 | 
            -
            < | 
| 4 | 
            +
            <http://rubydoc.info/github/dmchk/greeb/master/frames>.
         | 
| 5 5 |  | 
| 6 6 | 
             
            ## Installation
         | 
| 7 7 | 
             
            Add this line to your application's Gemfile:
         | 
| @@ -43,8 +43,8 @@ Greeb has a very convenient API that makes you happy. | |
| 43 43 | 
             
            ```ruby
         | 
| 44 44 | 
             
            pp Greeb::Tokenizer.tokenize('Hello!')
         | 
| 45 45 | 
             
            =begin
         | 
| 46 | 
            -
            [#<struct Greeb:: | 
| 47 | 
            -
             #<struct Greeb:: | 
| 46 | 
            +
            [#<struct Greeb::Span from=0, to=5, type=:letter>,
         | 
| 47 | 
            +
             #<struct Greeb::Span from=5, to=6, type=:punct>]
         | 
| 48 48 | 
             
            =end
         | 
| 49 49 | 
             
            ```
         | 
| 50 50 |  | 
| @@ -59,34 +59,34 @@ EOF | |
| 59 59 |  | 
| 60 60 | 
             
            pp Greeb::Tokenizer.tokenize(text)
         | 
| 61 61 | 
             
            =begin
         | 
| 62 | 
            -
            [#<struct Greeb:: | 
| 63 | 
            -
             #<struct Greeb:: | 
| 64 | 
            -
             #<struct Greeb:: | 
| 65 | 
            -
             #<struct Greeb:: | 
| 66 | 
            -
             #<struct Greeb:: | 
| 67 | 
            -
             #<struct Greeb:: | 
| 68 | 
            -
             #<struct Greeb:: | 
| 69 | 
            -
             #<struct Greeb:: | 
| 70 | 
            -
             #<struct Greeb:: | 
| 71 | 
            -
             #<struct Greeb:: | 
| 72 | 
            -
             #<struct Greeb:: | 
| 73 | 
            -
             #<struct Greeb:: | 
| 74 | 
            -
             #<struct Greeb:: | 
| 75 | 
            -
             #<struct Greeb:: | 
| 76 | 
            -
             #<struct Greeb:: | 
| 77 | 
            -
             #<struct Greeb:: | 
| 78 | 
            -
             #<struct Greeb:: | 
| 79 | 
            -
             #<struct Greeb:: | 
| 80 | 
            -
             #<struct Greeb:: | 
| 81 | 
            -
             #<struct Greeb:: | 
| 82 | 
            -
             #<struct Greeb:: | 
| 83 | 
            -
             #<struct Greeb:: | 
| 84 | 
            -
             #<struct Greeb:: | 
| 85 | 
            -
             #<struct Greeb:: | 
| 86 | 
            -
             #<struct Greeb:: | 
| 87 | 
            -
             #<struct Greeb:: | 
| 88 | 
            -
             #<struct Greeb:: | 
| 89 | 
            -
             #<struct Greeb:: | 
| 62 | 
            +
            [#<struct Greeb::Span from=0, to=5, type=:letter>,
         | 
| 63 | 
            +
             #<struct Greeb::Span from=5, to=6, type=:punct>,
         | 
| 64 | 
            +
             #<struct Greeb::Span from=6, to=7, type=:space>,
         | 
| 65 | 
            +
             #<struct Greeb::Span from=7, to=8, type=:letter>,
         | 
| 66 | 
            +
             #<struct Greeb::Span from=8, to=9, type=:space>,
         | 
| 67 | 
            +
             #<struct Greeb::Span from=9, to=11, type=:letter>,
         | 
| 68 | 
            +
             #<struct Greeb::Span from=11, to=12, type=:space>,
         | 
| 69 | 
            +
             #<struct Greeb::Span from=12, to=14, type=:integer>,
         | 
| 70 | 
            +
             #<struct Greeb::Span from=14, to=15, type=:punct>,
         | 
| 71 | 
            +
             #<struct Greeb::Span from=15, to=16, type=:space>,
         | 
| 72 | 
            +
             #<struct Greeb::Span from=16, to=18, type=:letter>,
         | 
| 73 | 
            +
             #<struct Greeb::Span from=18, to=19, type=:space>,
         | 
| 74 | 
            +
             #<struct Greeb::Span from=19, to=28, type=:letter>,
         | 
| 75 | 
            +
             #<struct Greeb::Span from=28, to=29, type=:space>,
         | 
| 76 | 
            +
             #<struct Greeb::Span from=29, to=35, type=:letter>,
         | 
| 77 | 
            +
             #<struct Greeb::Span from=35, to=36, type=:space>,
         | 
| 78 | 
            +
             #<struct Greeb::Span from=36, to=38, type=:letter>,
         | 
| 79 | 
            +
             #<struct Greeb::Span from=38, to=39, type=:space>,
         | 
| 80 | 
            +
             #<struct Greeb::Span from=39, to=44, type=:float>,
         | 
| 81 | 
            +
             #<struct Greeb::Span from=44, to=47, type=:punct>,
         | 
| 82 | 
            +
             #<struct Greeb::Span from=47, to=49, type=:break>,
         | 
| 83 | 
            +
             #<struct Greeb::Span from=49, to=53, type=:letter>,
         | 
| 84 | 
            +
             #<struct Greeb::Span from=53, to=54, type=:space>,
         | 
| 85 | 
            +
             #<struct Greeb::Span from=54, to=59, type=:letter>,
         | 
| 86 | 
            +
             #<struct Greeb::Span from=59, to=60, type=:space>,
         | 
| 87 | 
            +
             #<struct Greeb::Span from=60, to=63, type=:letter>,
         | 
| 88 | 
            +
             #<struct Greeb::Span from=63, to=64, type=:punct>,
         | 
| 89 | 
            +
             #<struct Greeb::Span from=64, to=65, type=:break>]
         | 
| 90 90 | 
             
            =end
         | 
| 91 91 | 
             
            ```
         | 
| 92 92 |  | 
| @@ -99,8 +99,8 @@ text = 'Hello! How are you?' | |
| 99 99 | 
             
            tokens = Greeb::Tokenizer.tokenize(text)
         | 
| 100 100 | 
             
            pp Greeb::Segmentator.new(tokens).sentences
         | 
| 101 101 | 
             
            =begin
         | 
| 102 | 
            -
            [#<struct Greeb:: | 
| 103 | 
            -
             #<struct Greeb:: | 
| 102 | 
            +
            [#<struct Greeb::Span from=0, to=6, type=:sentence>,
         | 
| 103 | 
            +
             #<struct Greeb::Span from=7, to=19, type=:sentence>]
         | 
| 104 104 | 
             
            =end
         | 
| 105 105 | 
             
            ```
         | 
| 106 106 |  | 
| @@ -113,21 +113,21 @@ tokens = Greeb::Tokenizer.tokenize(text) | |
| 113 113 | 
             
            segmentator = Greeb::Segmentator.new(tokens)
         | 
| 114 114 | 
             
            pp segmentator.extract(segmentator.sentences)
         | 
| 115 115 | 
             
            =begin
         | 
| 116 | 
            -
            {#<struct Greeb:: | 
| 117 | 
            -
              [#<struct Greeb:: | 
| 118 | 
            -
               #<struct Greeb:: | 
| 119 | 
            -
             #<struct Greeb:: | 
| 120 | 
            -
              [#<struct Greeb:: | 
| 121 | 
            -
               #<struct Greeb:: | 
| 122 | 
            -
               #<struct Greeb:: | 
| 123 | 
            -
               #<struct Greeb:: | 
| 124 | 
            -
               #<struct Greeb:: | 
| 125 | 
            -
               #<struct Greeb:: | 
| 116 | 
            +
            {#<struct Greeb::Span from=0, to=6, type=:sentence>=>
         | 
| 117 | 
            +
              [#<struct Greeb::Span from=0, to=5, type=:letter>,
         | 
| 118 | 
            +
               #<struct Greeb::Span from=5, to=6, type=:punct>],
         | 
| 119 | 
            +
             #<struct Greeb::Span from=7, to=19, type=:sentence>=>
         | 
| 120 | 
            +
              [#<struct Greeb::Span from=7, to=10, type=:letter>,
         | 
| 121 | 
            +
               #<struct Greeb::Span from=10, to=11, type=:space>,
         | 
| 122 | 
            +
               #<struct Greeb::Span from=11, to=14, type=:letter>,
         | 
| 123 | 
            +
               #<struct Greeb::Span from=14, to=15, type=:space>,
         | 
| 124 | 
            +
               #<struct Greeb::Span from=15, to=18, type=:letter>,
         | 
| 125 | 
            +
               #<struct Greeb::Span from=18, to=19, type=:punct>]}
         | 
| 126 126 | 
             
            =end
         | 
| 127 127 | 
             
            ```
         | 
| 128 128 |  | 
| 129 129 | 
             
            ### Parsing API
         | 
| 130 | 
            -
            Texts are often include some special  | 
| 130 | 
            +
            Texts often include some special spans such as URLs and e-mail
         | 
| 131 131 | 
             
            addresses. Greeb can help you in these strings retrieval.
         | 
| 132 132 |  | 
| 133 133 | 
             
            #### URL and E-mail retrieval
         | 
| @@ -136,12 +136,12 @@ text = 'My website is http://nlpub.ru and e-mail is example@example.com.' | |
| 136 136 |  | 
| 137 137 | 
             
            pp Greeb::Parser.urls(text).map { |e| [e, text[e.from...e.to]] }
         | 
| 138 138 | 
             
            =begin
         | 
| 139 | 
            -
            [[#<struct Greeb:: | 
| 139 | 
            +
            [[#<struct Greeb::Span from=14, to=29, type=:url>, "http://nlpub.ru"]]
         | 
| 140 140 | 
             
            =end
         | 
| 141 141 |  | 
| 142 142 | 
             
            pp Greeb::Parser.emails(text).map { |e| [e, text[e.from...e.to]] }
         | 
| 143 143 | 
             
            =begin
         | 
| 144 | 
            -
            [[#<struct Greeb:: | 
| 144 | 
            +
            [[#<struct Greeb::Span from=44, to=63, type=:email>, "example@example.com"]]
         | 
| 145 145 | 
             
            =end
         | 
| 146 146 | 
             
            ```
         | 
| 147 147 |  | 
| @@ -153,7 +153,7 @@ text = 'Hello, G.L.H.F. everyone!' | |
| 153 153 |  | 
| 154 154 | 
             
            pp Greeb::Parser.abbrevs(text).map { |e| [e, text[e.from...e.to]] }
         | 
| 155 155 | 
             
            =begin
         | 
| 156 | 
            -
            [[#<struct Greeb:: | 
| 156 | 
            +
            [[#<struct Greeb::Span from=7, to=15, type=:abbrev>, "G.L.H.F."]]
         | 
| 157 157 | 
             
            =end
         | 
| 158 158 | 
             
            ```
         | 
| 159 159 |  | 
| @@ -161,13 +161,13 @@ The algorithm is not so accurate, but still useful in many practical | |
| 161 161 | 
             
            situations.
         | 
| 162 162 |  | 
| 163 163 | 
             
            ## Tokens
         | 
| 164 | 
            -
            Greeb operates with  | 
| 165 | 
            -
            *from* is a beginning of the  | 
| 166 | 
            -
            and *kind* is a type of the  | 
| 164 | 
            +
            Greeb operates with spans, tuples of *(from, to, kind)*, where
         | 
| 165 | 
            +
            *from* is the beginning of the span, *to* is the end of the span,
         | 
| 166 | 
            +
            and *kind* is a type of the span.
         | 
| 167 167 |  | 
| 168 | 
            -
            There are several  | 
| 168 | 
            +
            There are several span types at the tokenization stage: `:letter`,
         | 
| 169 169 | 
             
            `:float`, `:integer`, `:separ`, `:punct` (for punctuation), `:spunct`
         | 
| 170 | 
            -
            (for in-sentence punctuation), and `:break`.
         | 
| 170 | 
            +
            (for in-sentence punctuation), `:space`, and `:break`.
         | 
| 171 171 |  | 
| 172 172 | 
             
            ## Contributing
         | 
| 173 173 | 
             
            1. Fork it;
         | 
    
        data/bin/greeb
    CHANGED
    
    | @@ -8,6 +8,6 @@ require 'greeb' | |
| 8 8 |  | 
| 9 9 | 
             
            text = STDIN.read.tap(&:chomp!)
         | 
| 10 10 |  | 
| 11 | 
            -
            Greeb[text].each do | | 
| 12 | 
            -
              puts text[ | 
| 11 | 
            +
            Greeb[text].each do |span|
         | 
| 12 | 
            +
              puts text[span.from...span.to] unless [:space, :break].include? span.type
         | 
| 13 13 | 
             
            end
         | 
    
        data/lib/greeb.rb
    CHANGED
    
    | @@ -1,45 +1,8 @@ | |
| 1 1 | 
             
            # encoding: utf-8
         | 
| 2 2 |  | 
| 3 3 | 
             
            require 'greeb/version'
         | 
| 4 | 
            -
             | 
| 5 | 
            -
             | 
| 6 | 
            -
            # *from* is a beginning of the entity, *to* is an ending of the entity,
         | 
| 7 | 
            -
            # and *kind* is a type of the entity.
         | 
| 8 | 
            -
            #
         | 
| 9 | 
            -
            # There are several entity types: `:letter`, `:float`, `:integer`,
         | 
| 10 | 
            -
            # `:separ` for separators, `:punct` for punctuation characters,
         | 
| 11 | 
            -
            # `:spunct` for in-sentence punctuation characters, and
         | 
| 12 | 
            -
            # `:break` for line endings.
         | 
| 13 | 
            -
            #
         | 
| 14 | 
            -
            class Greeb::Entity < Struct.new(:from, :to, :type)
         | 
| 15 | 
            -
              # @private
         | 
| 16 | 
            -
              def <=> other
         | 
| 17 | 
            -
                if (comparison = self.from <=> other.from) == 0
         | 
| 18 | 
            -
                  self.to <=> other.to
         | 
| 19 | 
            -
                else
         | 
| 20 | 
            -
                  comparison
         | 
| 21 | 
            -
                end
         | 
| 22 | 
            -
              end
         | 
| 23 | 
            -
            end
         | 
| 24 | 
            -
             | 
| 25 | 
            -
            # This runtime error appears when {Greeb::Tokenizer} or
         | 
| 26 | 
            -
            # {Greeb::Segmentator} tries to recognize unknown character.
         | 
| 27 | 
            -
            #
         | 
| 28 | 
            -
            class Greeb::UnknownEntity < RuntimeError
         | 
| 29 | 
            -
              attr_reader :text, :pos
         | 
| 30 | 
            -
             | 
| 31 | 
            -
              # @private
         | 
| 32 | 
            -
              def initialize(text, pos)
         | 
| 33 | 
            -
                @text, @pos = text, pos
         | 
| 34 | 
            -
              end
         | 
| 35 | 
            -
             | 
| 36 | 
            -
              # Generate the real error message.
         | 
| 37 | 
            -
              #
         | 
| 38 | 
            -
              def to_s
         | 
| 39 | 
            -
                'Could not recognize character "%s" @ %d' % [text[pos], pos]
         | 
| 40 | 
            -
              end
         | 
| 41 | 
            -
            end
         | 
| 42 | 
            -
             | 
| 4 | 
            +
            require 'greeb/exceptions'
         | 
| 5 | 
            +
            require 'greeb/span'
         | 
| 43 6 | 
             
            require 'greeb/strscan'
         | 
| 44 7 | 
             
            require 'greeb/tokenizer'
         | 
| 45 8 | 
             
            require 'greeb/segmentator'
         | 
    
        data/lib/greeb/core.rb
    CHANGED
    
    | @@ -13,13 +13,13 @@ module Greeb::Core | |
| 13 13 | 
             
              #
         | 
| 14 14 | 
             
              # @param text [String] input text.
         | 
| 15 15 | 
             
              #
         | 
| 16 | 
            -
              # @return [Array<Greeb:: | 
| 16 | 
            +
              # @return [Array<Greeb::Span>] a set of tokens.
         | 
| 17 17 | 
             
              #
         | 
| 18 | 
            -
              def analyze  | 
| 18 | 
            +
              def analyze(text, helpers = HELPERS)
         | 
| 19 19 | 
             
                Greeb::Tokenizer.tokenize(text).tap do |tokens|
         | 
| 20 | 
            -
                   | 
| 20 | 
            +
                  helpers.each do |helper|
         | 
| 21 21 | 
             
                    Greeb::Parser.public_send(helper, text).each do |parsed|
         | 
| 22 | 
            -
                       | 
| 22 | 
            +
                      extract_spans(tokens, parsed)
         | 
| 23 23 | 
             
                    end
         | 
| 24 24 | 
             
                  end
         | 
| 25 25 | 
             
                end
         | 
| @@ -28,17 +28,18 @@ module Greeb::Core | |
| 28 28 | 
             
              alias_method :'[]', :analyze
         | 
| 29 29 |  | 
| 30 30 | 
             
              protected
         | 
| 31 | 
            -
              # Extact  | 
| 31 | 
            +
              # Extract spans of the specified type from the input spans set.
         | 
| 32 32 | 
             
              #
         | 
| 33 | 
            -
              # @param  | 
| 34 | 
            -
              # @param  | 
| 33 | 
            +
              # @param spans [Array<Greeb::Span>] input spans set.
         | 
| 34 | 
            +
              # @param span [Greeb::Span] span to be extracted.
         | 
| 35 35 | 
             
              #
         | 
| 36 | 
            -
              # @return [Greeb:: | 
| 36 | 
            +
              # @return [Greeb::Span] span to be extracted.
         | 
| 37 37 | 
             
              #
         | 
| 38 | 
            -
              def  | 
| 39 | 
            -
                from =  | 
| 40 | 
            -
                to =  | 
| 41 | 
            -
                 | 
| 38 | 
            +
              def extract_spans(spans, span)
         | 
| 39 | 
            +
                from = spans.index { |e| e.from == span.from }
         | 
| 40 | 
            +
                to = spans.index { |e| e.to == span.to }
         | 
| 41 | 
            +
                return unless from && to
         | 
| 42 | 
            +
                spans[from..to] = span
         | 
| 42 43 | 
             
              end
         | 
| 43 44 | 
             
            end
         | 
| 44 45 |  | 
| @@ -0,0 +1,17 @@ | |
| 1 | 
            +
            # This runtime error appears when {Greeb::Tokenizer} or
         | 
| 2 | 
            +
            # {Greeb::Segmentator} tries to recognize unknown character.
         | 
| 3 | 
            +
            #
         | 
| 4 | 
            +
            class Greeb::UnknownEntity < RuntimeError
         | 
| 5 | 
            +
              attr_reader :text, :pos
         | 
| 6 | 
            +
             | 
| 7 | 
            +
              # @private
         | 
| 8 | 
            +
              def initialize(text, pos)
         | 
| 9 | 
            +
                @text, @pos = text, pos
         | 
| 10 | 
            +
              end
         | 
| 11 | 
            +
             | 
| 12 | 
            +
              # Generate the real error message.
         | 
| 13 | 
            +
              #
         | 
| 14 | 
            +
              def to_s
         | 
| 15 | 
            +
                'Could not recognize character "%s" @ %d' % [text[pos], pos]
         | 
| 16 | 
            +
              end
         | 
| 17 | 
            +
            end
         | 
    
        data/lib/greeb/parser.rb
    CHANGED
    
    | @@ -16,12 +16,15 @@ module Greeb::Parser | |
| 16 16 | 
             
              # Another horrible pattern. Now for abbreviations.
         | 
| 17 17 | 
             
              ABBREV = /\b((-{0,1}\p{L}\.)*|(-{0,1}\p{L}\. )*)-{0,1}\p{L}\./i
         | 
| 18 18 |  | 
| 19 | 
            +
              # This pattern matches anything that looks like HTML. Or not.
         | 
| 20 | 
            +
              HTML = /<(.*?)>/i
         | 
| 21 | 
            +
             | 
| 19 22 | 
             
              # Recognize URLs in the input text. Actually, URL is obsolete standard
         | 
| 20 23 | 
             
              # and this code should be rewritten to use the URI concept.
         | 
| 21 24 | 
             
              #
         | 
| 22 25 | 
             
              # @param text [String] input text.
         | 
| 23 26 | 
             
              #
         | 
| 24 | 
            -
              # @return [Array<Greeb:: | 
| 27 | 
            +
              # @return [Array<Greeb::Span>] found URLs.
         | 
| 25 28 | 
             
              #
         | 
| 26 29 | 
             
              def urls(text)
         | 
| 27 30 | 
             
                scan(text, URL, :url)
         | 
| @@ -31,7 +34,7 @@ module Greeb::Parser | |
| 31 34 | 
             
              #
         | 
| 32 35 | 
             
              # @param text [String] input text.
         | 
| 33 36 | 
             
              #
         | 
| 34 | 
            -
              # @return [Array<Greeb:: | 
| 37 | 
            +
              # @return [Array<Greeb::Span>] found e-mail addresses.
         | 
| 35 38 | 
             
              #
         | 
| 36 39 | 
             
              def emails(text)
         | 
| 37 40 | 
             
                scan(text, EMAIL, :email)
         | 
| @@ -41,27 +44,37 @@ module Greeb::Parser | |
| 41 44 | 
             
              #
         | 
| 42 45 | 
             
              # @param text [String] input text.
         | 
| 43 46 | 
             
              #
         | 
| 44 | 
            -
              # @return [Array<Greeb:: | 
| 47 | 
            +
              # @return [Array<Greeb::Span>] found abbreviations.
         | 
| 45 48 | 
             
              #
         | 
| 46 49 | 
             
              def abbrevs(text)
         | 
| 47 50 | 
             
                scan(text, ABBREV, :abbrev)
         | 
| 48 51 | 
             
              end
         | 
| 49 52 |  | 
| 53 | 
            +
              # Recognize HTML-alike entities in the input text.
         | 
| 54 | 
            +
              #
         | 
| 55 | 
            +
              # @param text [String] input text.
         | 
| 56 | 
            +
              #
         | 
| 57 | 
            +
              # @return [Array<Greeb::Span>] found HTML entities.
         | 
| 58 | 
            +
              #
         | 
| 59 | 
            +
              def html(text)
         | 
| 60 | 
            +
                scan(text, HTML, :html)
         | 
| 61 | 
            +
              end
         | 
| 62 | 
            +
             | 
| 50 63 | 
             
              private
         | 
| 51 | 
            -
              # Implementation of regexp-based {Greeb:: | 
| 64 | 
            +
              # Implementation of regexp-based {Greeb::Span} scanner.
         | 
| 52 65 | 
             
              #
         | 
| 53 66 | 
             
              # @param text [String] input text.
         | 
| 54 67 | 
             
              # @param regexp [Regexp] regular expression to be used.
         | 
| 55 | 
            -
              # @param type [Symbol] type field for the new {Greeb:: | 
| 68 | 
            +
              # @param type [Symbol] type field for the new {Greeb::Span} instances.
         | 
| 56 69 | 
             
              # @param offset [Fixnum] offset of the next match.
         | 
| 57 70 | 
             
              #
         | 
| 58 | 
            -
              # @return [Array<Greeb:: | 
| 71 | 
            +
              # @return [Array<Greeb::Span>] found entities.
         | 
| 59 72 | 
             
              #
         | 
| 60 73 | 
             
              def scan(text, regexp, type, offset = 0)
         | 
| 61 74 | 
             
                Array.new.tap do |matches|
         | 
| 62 75 | 
             
                  while text and md = text.match(regexp)
         | 
| 63 76 | 
             
                    start, stop = md.offset(0)
         | 
| 64 | 
            -
                    matches << Greeb:: | 
| 77 | 
            +
                    matches << Greeb::Span.new(offset + start, offset + stop, type)
         | 
| 65 78 | 
             
                    text, offset = text[stop..-1], offset + stop
         | 
| 66 79 | 
             
                  end
         | 
| 67 80 | 
             
                end
         | 
    
        data/lib/greeb/segmentator.rb
    CHANGED
    
    | @@ -13,7 +13,7 @@ class Greeb::Segmentator | |
| 13 13 |  | 
| 14 14 | 
             
              # Create a new instance of {Greeb::Segmentator}.
         | 
| 15 15 | 
             
              #
         | 
| 16 | 
            -
              # @param tokens [Array<Greeb:: | 
| 16 | 
            +
              # @param tokens [Array<Greeb::Span>] tokens from [Greeb::Tokenizer].
         | 
| 17 17 | 
             
              #
         | 
| 18 18 | 
             
              def initialize(tokens)
         | 
| 19 19 | 
             
                @tokens = tokens
         | 
| @@ -21,62 +21,60 @@ class Greeb::Segmentator | |
| 21 21 |  | 
| 22 22 | 
             
              # Sentences memoization method.
         | 
| 23 23 | 
             
              #
         | 
| 24 | 
            -
              # @return [Array<Greeb:: | 
| 24 | 
            +
              # @return [Array<Greeb::Span>] a set of sentences.
         | 
| 25 25 | 
             
              #
         | 
| 26 26 | 
             
              def sentences
         | 
| 27 | 
            -
                @sentences ||=  | 
| 27 | 
            +
                @sentences ||= detect_spans(new_sentence, [:punct])
         | 
| 28 28 | 
             
              end
         | 
| 29 29 |  | 
| 30 30 | 
             
              # Subsentences memoization method.
         | 
| 31 31 | 
             
              #
         | 
| 32 | 
            -
              # @return [Array<Greeb:: | 
| 32 | 
            +
              # @return [Array<Greeb::Span>] a set of subsentences.
         | 
| 33 33 | 
             
              #
         | 
| 34 34 | 
             
              def subsentences
         | 
| 35 | 
            -
                @subsentences ||=  | 
| 35 | 
            +
                @subsentences ||= detect_spans(new_subsentence, [:punct, :spunct])
         | 
| 36 36 | 
             
              end
         | 
| 37 37 |  | 
| 38 38 | 
             
              # Extract tokens from the set of sentences.
         | 
| 39 39 | 
             
              #
         | 
| 40 | 
            -
              # @param sentences [Array<Greeb:: | 
| 40 | 
            +
              # @param sentences [Array<Greeb::Span>] a list of sentences.
         | 
| 41 41 | 
             
              #
         | 
| 42 | 
            -
              # @return [ | 
| 42 | 
            +
              # @return [Array<Greeb::Span, Array<Greeb::Span>>] a hash with
         | 
| 43 43 | 
             
              #   sentences as keys and tokens arrays as values.
         | 
| 44 44 | 
             
              #
         | 
| 45 45 | 
             
              def extract(sentences, collection = tokens)
         | 
| 46 | 
            -
                 | 
| 47 | 
            -
                   | 
| 48 | 
            -
             | 
| 49 | 
            -
                  end
         | 
| 50 | 
            -
                ]
         | 
| 46 | 
            +
                sentences.map do |s|
         | 
| 47 | 
            +
                  [s, collection.select { |t| t.from >= s.from and t.to <= s.to }]
         | 
| 48 | 
            +
                end
         | 
| 51 49 | 
             
              end
         | 
| 52 50 |  | 
| 53 51 | 
             
              protected
         | 
| 54 | 
            -
              # Implementation of the  | 
| 52 | 
            +
              # Implementation of the span detection method.
         | 
| 55 53 | 
             
              #
         | 
| 56 | 
            -
              # @param sample [Greeb:: | 
| 54 | 
            +
              # @param sample [Greeb::Span] a sample of span to be cloned in the
         | 
| 57 55 | 
             
              # process.
         | 
| 58 56 | 
             
              # @param stop_marks [Array<Symbol>] an array that stores the
         | 
| 59 | 
            -
              # correspondent stop marks of the necessary  | 
| 57 | 
            +
              # correspondent stop marks of the necessary spans.
         | 
| 60 58 | 
             
              #
         | 
| 61 | 
            -
              # @return [Array<Greeb:: | 
| 59 | 
            +
              # @return [Array<Greeb::Span>] a set of entities.
         | 
| 62 60 | 
             
              #
         | 
| 63 | 
            -
              def  | 
| 61 | 
            +
              def detect_spans(sample, stop_marks)
         | 
| 64 62 | 
             
                collection = []
         | 
| 65 63 |  | 
| 66 | 
            -
                rest = tokens.inject(sample.dup) do | | 
| 67 | 
            -
                  next  | 
| 68 | 
            -
                   | 
| 69 | 
            -
                  next  | 
| 64 | 
            +
                rest = tokens.inject(sample.dup) do |span, token|
         | 
| 65 | 
            +
                  next span if sentence_aint_start? span, token
         | 
| 66 | 
            +
                  span.from = token.from unless span.from
         | 
| 67 | 
            +
                  next span if span.to and span.to > token.to
         | 
| 70 68 |  | 
| 71 69 | 
             
                  if stop_marks.include? token.type
         | 
| 72 | 
            -
                     | 
| 73 | 
            -
                    collection <<  | 
| 74 | 
            -
                     | 
| 70 | 
            +
                    span.to = find_forward(tokens, token).to
         | 
| 71 | 
            +
                    collection << span
         | 
| 72 | 
            +
                    span = sample.dup
         | 
| 75 73 | 
             
                  elsif ![:separ, :space].include? token.type
         | 
| 76 | 
            -
                     | 
| 74 | 
            +
                    span.to = token.to
         | 
| 77 75 | 
             
                  end
         | 
| 78 76 |  | 
| 79 | 
            -
                   | 
| 77 | 
            +
                  span
         | 
| 80 78 | 
             
                end
         | 
| 81 79 |  | 
| 82 80 | 
             
                if rest.from && rest.to
         | 
| @@ -88,42 +86,42 @@ class Greeb::Segmentator | |
| 88 86 |  | 
| 89 87 | 
             
              private
         | 
| 90 88 | 
             
              # Check the possibility of starting a new sentence by the specified
         | 
| 91 | 
            -
              # pair of  | 
| 89 | 
            +
              # pair of span and token.
         | 
| 92 90 | 
             
              #
         | 
| 93 | 
            -
              # @param  | 
| 94 | 
            -
              # @param token [Greeb:: | 
| 91 | 
            +
              # @param span [Greeb::Span] a span to be checked.
         | 
| 92 | 
            +
              # @param token [Greeb::Span] a token to be checked.
         | 
| 95 93 | 
             
              #
         | 
| 96 94 | 
             
              # @return true or false.
         | 
| 97 95 | 
             
              #
         | 
| 98 | 
            -
              def sentence_aint_start?( | 
| 99 | 
            -
                ! | 
| 96 | 
            +
              def sentence_aint_start?(span, token)
         | 
| 97 | 
            +
                !span.from and SENTENCE_AINT_START.include? token.type
         | 
| 100 98 | 
             
              end
         | 
| 101 99 |  | 
| 102 100 | 
             
              # Find a forwarding token that has another type.
         | 
| 103 101 | 
             
              #
         | 
| 104 | 
            -
              # @param collection [Array<Greeb:: | 
| 105 | 
            -
              # @param sample [Greeb:: | 
| 102 | 
            +
              # @param collection [Array<Greeb::Span>] array of possible tokens.
         | 
| 103 | 
            +
              # @param sample [Greeb::Span] a token that is treated as a sample.
         | 
| 106 104 | 
             
              #
         | 
| 107 | 
            -
              # @return [Greeb:: | 
| 105 | 
            +
              # @return [Greeb::Span] a forwarding token.
         | 
| 108 106 | 
             
              #
         | 
| 109 107 | 
             
              def find_forward(collection, sample)
         | 
| 110 108 | 
             
                collection.select { |t| t.from >= sample.from }.
         | 
| 111 109 | 
             
                  inject(sample) { |r, t| t.type == sample.type ? t : (break r) }
         | 
| 112 110 | 
             
              end
         | 
| 113 111 |  | 
| 114 | 
            -
              # Create a new instance of {Greeb:: | 
| 112 | 
            +
              # Create a new instance of {Greeb::Span} with `:sentence` type.
         | 
| 115 113 | 
             
              #
         | 
| 116 | 
            -
              # @return [Greeb:: | 
| 114 | 
            +
              # @return [Greeb::Span] a new span instance.
         | 
| 117 115 | 
             
              #
         | 
| 118 116 | 
             
              def new_sentence
         | 
| 119 | 
            -
                Greeb:: | 
| 117 | 
            +
                Greeb::Span.new(nil, nil, :sentence)
         | 
| 120 118 | 
             
              end
         | 
| 121 119 |  | 
| 122 | 
            -
              # Create a new instance of {Greeb:: | 
| 120 | 
            +
              # Create a new instance of {Greeb::Span} with `:subsentence` type.
         | 
| 123 121 | 
             
              #
         | 
| 124 | 
            -
              # @return [Greeb:: | 
| 122 | 
            +
              # @return [Greeb::Span] a new span instance.
         | 
| 125 123 | 
             
              #
         | 
| 126 124 | 
             
              def new_subsentence
         | 
| 127 | 
            -
                Greeb:: | 
| 125 | 
            +
                Greeb::Span.new(nil, nil, :subsentence)
         | 
| 128 126 | 
             
              end
         | 
| 129 127 | 
             
            end
         |