rbbt-text 0.6.2 → 0.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rbbt/bow/dictionary.rb +1 -0
- data/lib/rbbt/corpus/document.rb +5 -2
- data/lib/rbbt/corpus/document_repo.rb +2 -1
- data/lib/rbbt/entity/document.rb +40 -0
- data/lib/rbbt/ner/segment.rb +9 -2
- data/lib/rbbt/ner/segment/named_entity.rb +4 -2
- data/lib/rbbt/ner/segment/token.rb +28 -3
- data/lib/rbbt/ner/segment/transformed.rb +116 -115
- data/lib/rbbt/ner/token_trieNER.rb +17 -13
- data/test/rbbt/corpus/test_document.rb +22 -10
- data/test/rbbt/ner/segment/test_named_entity.rb +1 -1
- data/test/rbbt/ner/segment/test_transformed.rb +38 -3
- metadata +6 -7
    
        data/lib/rbbt/bow/dictionary.rb
    CHANGED
    
    
    
        data/lib/rbbt/corpus/document.rb
    CHANGED
    
    | @@ -148,7 +148,9 @@ class Document | |
| 148 148 | 
             
                    fields = data.fields if fields.nil? and data.respond_to? :fields
         | 
| 149 149 |  | 
| 150 150 |  | 
| 151 | 
            -
                    data. | 
| 151 | 
            +
                    if data.respond_to? :persistence_path and String === data.persistence_path
         | 
| 152 | 
            +
                      data.filter(data.persistence_path + '.filters')
         | 
| 153 | 
            +
                    end
         | 
| 152 154 | 
             
                    data.add_filter("field:#{ doc_field }", @docid)
         | 
| 153 155 | 
             
                    data.add_filter("field:#{ entity_field }", "#{ entity }")
         | 
| 154 156 | 
             
                    keys = data.keys
         | 
| @@ -157,7 +159,7 @@ class Document | |
| 157 159 |  | 
| 158 160 | 
             
                    if keys.empty?
         | 
| 159 161 | 
             
                      segments = produce_#{entity}
         | 
| 160 | 
            -
                      segments << Segment.setup("No #{entity} found in document  | 
| 162 | 
            +
                      segments << Segment.setup("No #{entity} found in document " + @docid.to_s, -1) if segments.empty?
         | 
| 161 163 | 
             
                      tsv = Segment.tsv(segments, *fields.reject{|f| ["#{doc_field}", "#{entity_field}", "Start", "End", "annotation_types"].include? f})
         | 
| 162 164 |  | 
| 163 165 | 
             
                      tsv.add_field "#{ doc_field }" do
         | 
| @@ -178,6 +180,7 @@ class Document | |
| 178 180 | 
             
                      data.pop_filter
         | 
| 179 181 | 
             
                      data.pop_filter
         | 
| 180 182 | 
             
                      data.read
         | 
| 183 | 
            +
             | 
| 181 184 | 
             
                    else
         | 
| 182 185 | 
             
                      if raw == :check
         | 
| 183 186 | 
             
                        data.close
         | 
| @@ -0,0 +1,40 @@ | |
| 1 | 
            +
            require 'rbbt/entity'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module Document
         | 
| 4 | 
            +
              extend Entity
         | 
| 5 | 
            +
             | 
| 6 | 
            +
              class << self
         | 
| 7 | 
            +
                attr_accessor :corpus
         | 
| 8 | 
            +
              end
         | 
| 9 | 
            +
             | 
| 10 | 
            +
              property :text => :array2single do
         | 
| 11 | 
            +
                article_text = {}
         | 
| 12 | 
            +
                missing = []
         | 
| 13 | 
            +
             | 
| 14 | 
            +
                self.each do |doc|
         | 
| 15 | 
            +
                  Document.corpus.read if Document.corpus.respond_to? :read
         | 
| 16 | 
            +
                  if Document.corpus.include?(doc) 
         | 
| 17 | 
            +
                    article_text[doc] =  Document.corpus[doc] 
         | 
| 18 | 
            +
                  else
         | 
| 19 | 
            +
                    missing << doc
         | 
| 20 | 
            +
                  end
         | 
| 21 | 
            +
                end
         | 
| 22 | 
            +
             | 
| 23 | 
            +
                if missing.any?
         | 
| 24 | 
            +
                  missing.first.annotate missing
         | 
| 25 | 
            +
                  missing_text = Misc.process_to_hash(missing){|list| list._get_text}
         | 
| 26 | 
            +
             | 
| 27 | 
            +
                  Misc.lock Document.corpus.persistence_path do
         | 
| 28 | 
            +
                    Document.corpus.write if Document.corpus.respond_to? :write
         | 
| 29 | 
            +
                    missing_text.each do |doc, text|
         | 
| 30 | 
            +
                      article_text[doc] = text
         | 
| 31 | 
            +
                      Document.corpus[doc] = text
         | 
| 32 | 
            +
                    end
         | 
| 33 | 
            +
                    Document.corpus.read if Document.corpus.respond_to? :read
         | 
| 34 | 
            +
                  end
         | 
| 35 | 
            +
                end
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                article_text.values_at *self
         | 
| 38 | 
            +
              end
         | 
| 39 | 
            +
             | 
| 40 | 
            +
            end
         | 
    
        data/lib/rbbt/ner/segment.rb
    CHANGED
    
    | @@ -5,6 +5,10 @@ module Segment | |
| 5 5 | 
             
              extend Annotation
         | 
| 6 6 | 
             
              self.annotation :offset
         | 
| 7 7 |  | 
| 8 | 
            +
              def offset=(offset)
         | 
| 9 | 
            +
                @offset = offset.nil? ? nil : offset.to_i
         | 
| 10 | 
            +
              end
         | 
| 11 | 
            +
             | 
| 8 12 | 
             
              #{{{ Ranges
         | 
| 9 13 |  | 
| 10 14 | 
             
              def end
         | 
| @@ -297,8 +301,11 @@ module Segment | |
| 297 301 | 
             
              end
         | 
| 298 302 |  | 
| 299 303 | 
             
              def self.load_tsv(tsv)
         | 
| 300 | 
            -
                tsv. | 
| 301 | 
            -
             | 
| 304 | 
            +
                fields = tsv.fields
         | 
| 305 | 
            +
                tsv.with_unnamed do
         | 
| 306 | 
            +
                  tsv.collect do |id, values|
         | 
| 307 | 
            +
                    Annotated.load_tsv_values(id, values, fields)
         | 
| 308 | 
            +
                  end
         | 
| 302 309 | 
             
                end
         | 
| 303 310 | 
             
              end
         | 
| 304 311 |  | 
| @@ -2,9 +2,34 @@ require 'rbbt/annotations' | |
| 2 2 | 
             
            require 'rbbt/ner/segment'
         | 
| 3 3 |  | 
| 4 4 | 
             
            module Token
         | 
| 5 | 
            -
               | 
| 6 | 
            -
               | 
| 7 | 
            -
              self. | 
| 5 | 
            +
              attr_accessor :offset, :original
         | 
| 6 | 
            +
              
         | 
| 7 | 
            +
              def self.all_annotations
         | 
| 8 | 
            +
                [:offset, :original]
         | 
| 9 | 
            +
              end
         | 
| 10 | 
            +
             | 
| 11 | 
            +
              def self.setup(text, start, original = nil)
         | 
| 12 | 
            +
                text.extend Token
         | 
| 13 | 
            +
                text.offset = start
         | 
| 14 | 
            +
                text.original = original
         | 
| 15 | 
            +
                text
         | 
| 16 | 
            +
              end
         | 
| 17 | 
            +
              
         | 
| 18 | 
            +
              def info
         | 
| 19 | 
            +
                {:original => original, :offset => offset}
         | 
| 20 | 
            +
              end
         | 
| 21 | 
            +
             | 
| 22 | 
            +
              def id
         | 
| 23 | 
            +
                Misc.hash2md5 info.merge :self => self
         | 
| 24 | 
            +
              end
         | 
| 25 | 
            +
             | 
| 26 | 
            +
              def end
         | 
| 27 | 
            +
                offset + self.length - 1
         | 
| 28 | 
            +
              end
         | 
| 29 | 
            +
             | 
| 30 | 
            +
              def range
         | 
| 31 | 
            +
                (offset..self.end)
         | 
| 32 | 
            +
              end
         | 
| 8 33 |  | 
| 9 34 | 
             
              def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
         | 
| 10 35 |  | 
| @@ -1,9 +1,9 @@ | |
| 1 | 
            +
            require 'rbbt/util/misc'
         | 
| 1 2 | 
             
            require 'rbbt/ner/segment'
         | 
| 3 | 
            +
             | 
| 2 4 | 
             
            module Transformed
         | 
| 3 | 
            -
              attr_accessor :transformation_offset_differences, :transformation_original
         | 
| 4 5 |  | 
| 5 6 | 
             
              def self.transform(text, segments, replacement = nil, &block)
         | 
| 6 | 
            -
                require 'rbbt/util/misc'
         | 
| 7 7 |  | 
| 8 8 | 
             
                text.extend Transformed
         | 
| 9 9 | 
             
                text.replace(segments, replacement, &block)
         | 
| @@ -12,7 +12,6 @@ module Transformed | |
| 12 12 | 
             
              end
         | 
| 13 13 |  | 
| 14 14 | 
             
              def self.with_transform(text, segments, replacement)
         | 
| 15 | 
            -
                require 'rbbt/util/misc'
         | 
| 16 15 |  | 
| 17 16 | 
             
                text.extend Transformed
         | 
| 18 17 | 
             
                text.replace(segments, replacement)
         | 
| @@ -24,147 +23,149 @@ module Transformed | |
| 24 23 | 
             
                text.restore(segments, true)
         | 
| 25 24 | 
             
              end
         | 
| 26 25 |  | 
| 27 | 
            -
               | 
| 28 | 
            -
             | 
| 29 | 
            -
             | 
| 30 | 
            -
                 | 
| 31 | 
            -
                 | 
| 32 | 
            -
             | 
| 33 | 
            -
             | 
| 34 | 
            -
             | 
| 35 | 
            -
             | 
| 26 | 
            +
              attr_accessor :transformed_segments, :transformation_stack
         | 
| 27 | 
            +
             
         | 
| 28 | 
            +
              def shift(segment_o)
         | 
| 29 | 
            +
                begin_shift = 0
         | 
| 30 | 
            +
                end_shift = 0
         | 
| 31 | 
            +
             | 
| 32 | 
            +
                @transformed_segments.sort_by{|id, info| info.last}.each{|id,info| 
         | 
| 33 | 
            +
                  pseg_o, diff = info
         | 
| 34 | 
            +
             | 
| 35 | 
            +
                  case
         | 
| 36 | 
            +
                    # Before
         | 
| 37 | 
            +
                  when segment_o.last + end_shift < pseg_o.begin
         | 
| 38 | 
            +
                    # After
         | 
| 39 | 
            +
                  when (segment_o.begin + begin_shift > pseg_o.last)
         | 
| 40 | 
            +
                    begin_shift += diff
         | 
| 41 | 
            +
                    end_shift += diff
         | 
| 42 | 
            +
                    # Includes
         | 
| 43 | 
            +
                  when (segment_o.begin + begin_shift <= pseg_o.begin and segment_o.last + end_shift >= pseg_o.last)
         | 
| 44 | 
            +
                    end_shift += diff
         | 
| 45 | 
            +
                    # Inside
         | 
| 46 | 
            +
                  when (segment_o.begin + begin_shift >= pseg_o.begin and segment_o.last + end_shift <= pseg_o.last)
         | 
| 47 | 
            +
                    return nil
         | 
| 48 | 
            +
                    # Overlaps start
         | 
| 49 | 
            +
                  when (segment_o.begin + begin_shift <= pseg_o.begin and segment_o.last + end_shift <= pseg_o.last)
         | 
| 50 | 
            +
                    return nil
         | 
| 51 | 
            +
                    # Overlaps end
         | 
| 52 | 
            +
                  when (segment_o.begin + begin_shift >= pseg_o.begin and segment_o.last + end_shift >= pseg_o.last)
         | 
| 53 | 
            +
                    return nil
         | 
| 54 | 
            +
                 else
         | 
| 55 | 
            +
                    raise "Unknown overlaps: #{segment_o.inspect} - #{pseg_o.inspect}"
         | 
| 36 56 | 
             
                  end
         | 
| 37 | 
            -
             | 
| 38 | 
            -
                end
         | 
| 57 | 
            +
                }
         | 
| 39 58 |  | 
| 40 | 
            -
                 | 
| 59 | 
            +
                [begin_shift, end_shift]
         | 
| 41 60 | 
             
              end
         | 
| 42 61 |  | 
| 43 | 
            -
              def  | 
| 44 | 
            -
                 | 
| 62 | 
            +
              def self.sort(segments)
         | 
| 63 | 
            +
                segments.compact.sort do |a,b|
         | 
| 64 | 
            +
                  case
         | 
| 65 | 
            +
                  when ((a.nil? and b.nil?) or (a.offset.nil? and b.offset.nil?))
         | 
| 66 | 
            +
                    0
         | 
| 67 | 
            +
                  when (a.nil? or a.offset.nil?)
         | 
| 68 | 
            +
                    -1
         | 
| 69 | 
            +
                  when (b.nil? or b.offset.nil?)
         | 
| 70 | 
            +
                    +1
         | 
| 71 | 
            +
                    # Non-overlap
         | 
| 72 | 
            +
                  when (a.end < b.offset or b.end < a.offset)
         | 
| 73 | 
            +
                    b.offset <=> a.offset
         | 
| 74 | 
            +
                    # b includes a
         | 
| 75 | 
            +
                  when (a.offset >= b.offset and a.end <= b.end)
         | 
| 76 | 
            +
                    -1
         | 
| 77 | 
            +
                    # a includes b
         | 
| 78 | 
            +
                  when (b.offset >= a.offset and b.end <= a.end)
         | 
| 79 | 
            +
                    +1
         | 
| 80 | 
            +
                    # Overlap
         | 
| 81 | 
            +
                  when (a.offset > b.offset and a.end > b.end or b.offset < a.offset and b.end > a.end)
         | 
| 82 | 
            +
                    a.length <=> b.length
         | 
| 83 | 
            +
                  else
         | 
| 84 | 
            +
                    raise "Unexpected case in sort: #{a.range} - #{b.range}"
         | 
| 85 | 
            +
                  end
         | 
| 86 | 
            +
                end
         | 
| 45 87 | 
             
              end
         | 
| 46 88 |  | 
| 47 | 
            -
              def  | 
| 48 | 
            -
                 | 
| 49 | 
            -
             | 
| 50 | 
            -
             | 
| 51 | 
            -
                            when Integer === pos
         | 
| 52 | 
            -
                              transform_pos(pos)
         | 
| 53 | 
            -
                            else
         | 
| 54 | 
            -
                              raise "Text position not understood '#{pos.inspect}'. Not Range or Integer"
         | 
| 55 | 
            -
                            end
         | 
| 56 | 
            -
             | 
| 57 | 
            -
                self[transformed_pos] = value
         | 
| 58 | 
            -
              end
         | 
| 89 | 
            +
              def replace(segments, replacement = nil, &block)
         | 
| 90 | 
            +
                @transformed_segments ||= {}
         | 
| 91 | 
            +
                @transformation_stack ||= []
         | 
| 92 | 
            +
                stack = []
         | 
| 59 93 |  | 
| 60 | 
            -
             | 
| 61 | 
            -
             | 
| 62 | 
            -
             | 
| 63 | 
            -
                              transform_range(pos)
         | 
| 64 | 
            -
                            when Integer === pos
         | 
| 65 | 
            -
                              transform_pos(pos)
         | 
| 66 | 
            -
                            else
         | 
| 67 | 
            -
                              raise "Text position not understood '#{pos.inspect}'. Not Range or Integer"
         | 
| 68 | 
            -
                            end
         | 
| 69 | 
            -
             | 
| 70 | 
            -
                self[transformed_pos]
         | 
| 71 | 
            -
              end
         | 
| 94 | 
            +
                Transformed.sort(segments).each do |segment|
         | 
| 95 | 
            +
                  next if segment.offset.nil?
         | 
| 96 | 
            +
                  shift = shift segment.range
         | 
| 72 97 |  | 
| 73 | 
            -
             | 
| 74 | 
            -
                return false if @transformation_offset_differences.nil? or @transformation_offset_differences.empty?
         | 
| 75 | 
            -
                transformation_offset_difference = @transformation_offset_differences.last
         | 
| 98 | 
            +
                  next if shift.nil?
         | 
| 76 99 |  | 
| 77 | 
            -
             | 
| 78 | 
            -
                  offset, diff, orig_length, trans_length = info
         | 
| 79 | 
            -
                  return true if segment_range.begin > offset and segment_range.begin < offset + trans_length or
         | 
| 80 | 
            -
                  segment_range.end   > offset and segment_range.end   < offset + trans_length
         | 
| 81 | 
            -
                end
         | 
| 100 | 
            +
                  shift_begin, shift_end = shift
         | 
| 82 101 |  | 
| 83 | 
            -
             | 
| 84 | 
            -
             | 
| 102 | 
            +
                  text_offset = self.respond_to?(:offset)? self.offset : 0
         | 
| 103 | 
            +
                  updated_begin = segment.offset + shift_begin - text_offset
         | 
| 104 | 
            +
                  updated_end   = segment.range.last + shift_end - text_offset
         | 
| 85 105 |  | 
| 86 | 
            -
             | 
| 87 | 
            -
                replacement ||= block
         | 
| 88 | 
            -
                raise "No replacement given" if replacement.nil?
         | 
| 89 | 
            -
                transformation_offset_differences = []
         | 
| 90 | 
            -
                transformation_original = []
         | 
| 106 | 
            +
                  updated_range = (updated_begin..updated_end)
         | 
| 91 107 |  | 
| 92 | 
            -
             | 
| 93 | 
            -
                  untransformed_segment_range_here= segment.range_in(self)
         | 
| 94 | 
            -
                  transformed_segment_range  = self.transform_range(untransformed_segment_range_here)
         | 
| 95 | 
            -
                  next if conflict?(transformed_segment_range)
         | 
| 108 | 
            +
                  updated_text = self[updated_begin..updated_end]
         | 
| 96 109 |  | 
| 97 | 
            -
                   | 
| 110 | 
            +
                  original_text = segment.dup
         | 
| 111 | 
            +
                  segment.replace updated_text
         | 
| 98 112 |  | 
| 99 113 | 
             
                  case
         | 
| 114 | 
            +
                  when block_given?
         | 
| 115 | 
            +
                    new =  block.call(segment)
         | 
| 100 116 | 
             
                  when String === replacement
         | 
| 101 | 
            -
                     | 
| 117 | 
            +
                    new = replacement
         | 
| 102 118 | 
             
                  when Proc === replacement
         | 
| 119 | 
            +
                    new = replacement.call(segment)
         | 
| 120 | 
            +
                  end
         | 
| 103 121 |  | 
| 104 | 
            -
             | 
| 105 | 
            -
                    save_segment_text = segment.dup
         | 
| 106 | 
            -
                    save_offset = segment.offset
         | 
| 107 | 
            -
                    segment.replace text_before_transform
         | 
| 108 | 
            -
                    segment.offset = transformed_segment_range.begin
         | 
| 122 | 
            +
                  diff = new.length - segment.length
         | 
| 109 123 |  | 
| 110 | 
            -
             | 
| 124 | 
            +
                  self[updated_begin..updated_end] = new
         | 
| 111 125 |  | 
| 112 | 
            -
             | 
| 113 | 
            -
                    segment.replace save_segment_text
         | 
| 114 | 
            -
                    segment.offset = save_offset
         | 
| 115 | 
            -
                  else
         | 
| 116 | 
            -
                    raise "Replacemente not String nor Proc"
         | 
| 117 | 
            -
                  end
         | 
| 118 | 
            -
                  diff = segment.length - transformed_text.length
         | 
| 119 | 
            -
                  self[transformed_segment_range] = transformed_text
         | 
| 126 | 
            +
                  @transformed_segments[segment.object_id] = [segment.range, diff, updated_text, updated_range, @transformed_segments.size]
         | 
| 120 127 |  | 
| 121 | 
            -
                   | 
| 122 | 
            -
                   | 
| 128 | 
            +
                  segment.replace original_text
         | 
| 129 | 
            +
                  stack << segment.object_id
         | 
| 123 130 | 
             
                end
         | 
| 131 | 
            +
                @transformation_stack << stack
         | 
| 132 | 
            +
              end
         | 
| 124 133 |  | 
| 125 | 
            -
             | 
| 126 | 
            -
                 | 
| 127 | 
            -
             | 
| 128 | 
            -
                 | 
| 134 | 
            +
              def fix_segment(segment, range, diff)
         | 
| 135 | 
            +
                case
         | 
| 136 | 
            +
                  # Before
         | 
| 137 | 
            +
                when segment.end < range.begin
         | 
| 138 | 
            +
                  # After
         | 
| 139 | 
            +
                when segment.offset > range.end + diff
         | 
| 140 | 
            +
                  segment.offset -= diff
         | 
| 141 | 
            +
                  # Includes
         | 
| 142 | 
            +
                when (segment.offset <= range.begin and segment.end >= range.end + diff)
         | 
| 143 | 
            +
                  segment.replace self[segment.offset..segment.end - diff]
         | 
| 144 | 
            +
                else
         | 
| 145 | 
            +
                  raise "Segment Overlaps"
         | 
| 146 | 
            +
                end
         | 
| 129 147 | 
             
              end
         | 
| 130 148 |  | 
| 131 | 
            -
              def restore(segments | 
| 132 | 
            -
                 | 
| 133 | 
            -
                while self.transformation_offset_differences.any? and not stop
         | 
| 134 | 
            -
                  transformation_offset_differences = self.transformation_offset_differences.pop
         | 
| 135 | 
            -
                  transformation_original           = self.transformation_original.pop
         | 
| 149 | 
            +
              def restore(segments, first_only = false)
         | 
| 150 | 
            +
                return segments if @transformation_stack.empty?
         | 
| 136 151 |  | 
| 137 | 
            -
             | 
| 138 | 
            -
             | 
| 139 | 
            -
             | 
| 152 | 
            +
                if first_only
         | 
| 153 | 
            +
                  @transformation_stack.pop.reverse.each do |id|
         | 
| 154 | 
            +
                    orig_range, diff, text, range = @transformed_segments.delete id
         | 
| 140 155 |  | 
| 141 | 
            -
             | 
| 142 | 
            -
                    self | 
| 156 | 
            +
                    new_range = (range.begin..range.last + diff)
         | 
| 157 | 
            +
                    self[new_range] = text
         | 
| 158 | 
            +
                    segments.each do |segment|
         | 
| 159 | 
            +
                      next unless Segment === segment
         | 
| 160 | 
            +
                      fix_segment(segment, range, diff)
         | 
| 161 | 
            +
                    end if Array === segments
         | 
| 143 162 | 
             
                  end
         | 
| 144 | 
            -
             | 
| 145 | 
            -
             | 
| 146 | 
            -
             | 
| 147 | 
            -
             | 
| 148 | 
            -
             | 
| 149 | 
            -
                  segment_ranges = segments.each do |segment|
         | 
| 150 | 
            -
                    r = segment.range
         | 
| 151 | 
            -
             | 
| 152 | 
            -
                    s = r.begin
         | 
| 153 | 
            -
                    e = r.end
         | 
| 154 | 
            -
                    sdiff = 0
         | 
| 155 | 
            -
                    ediff = 0
         | 
| 156 | 
            -
                    transformation_offset_differences.reverse.each do |offset,diff,orig_length,rep_length|
         | 
| 157 | 
            -
                      sdiff += diff if offset < s
         | 
| 158 | 
            -
                      ediff += diff if offset + rep_length - 1 < e
         | 
| 159 | 
            -
                    end
         | 
| 160 | 
            -
             | 
| 161 | 
            -
                    segment.offset = s + sdiff
         | 
| 162 | 
            -
                    segment.replace self[(s+sdiff)..(e + ediff)]
         | 
| 163 | 
            +
                  segments
         | 
| 164 | 
            +
                else
         | 
| 165 | 
            +
                  while @transformation_stack.any?
         | 
| 166 | 
            +
                    restore(segments, true)
         | 
| 163 167 | 
             
                  end
         | 
| 168 | 
            +
                  segments
         | 
| 164 169 | 
             
                end
         | 
| 165 | 
            -
             | 
| 166 | 
            -
                segments
         | 
| 167 170 | 
             
              end
         | 
| 168 171 | 
             
            end
         | 
| 169 | 
            -
             | 
| 170 | 
            -
             | 
| @@ -110,7 +110,7 @@ class TokenTrieNER < NER | |
| 110 110 | 
             
              end
         | 
| 111 111 |  | 
| 112 112 | 
             
              def self.merge(index1, index2)
         | 
| 113 | 
            -
                index1.write if index1.respond_to? :write
         | 
| 113 | 
            +
                index1.write if index1.respond_to? :write and not index1.write?
         | 
| 114 114 | 
             
                index2.each do |key, new_index2|
         | 
| 115 115 | 
             
                  case
         | 
| 116 116 | 
             
                  when key == :END
         | 
| @@ -119,7 +119,8 @@ class TokenTrieNER < NER | |
| 119 119 | 
             
                    end1.uniq!
         | 
| 120 120 | 
             
                    index1[:END] = end1
         | 
| 121 121 | 
             
                  when index1.include?(key)
         | 
| 122 | 
            -
                     | 
| 122 | 
            +
                    new = merge(index1[key], new_index2)
         | 
| 123 | 
            +
                    index1[key] = new
         | 
| 123 124 | 
             
                  else
         | 
| 124 125 | 
             
                    index1[key] = new_index2
         | 
| 125 126 | 
             
                  end
         | 
| @@ -148,7 +149,10 @@ class TokenTrieNER < NER | |
| 148 149 | 
             
                    tokens = Array === name ? name : tokenize(name, false, split_at, no_clean) 
         | 
| 149 150 | 
             
                    tokens.extend EnumeratedArray
         | 
| 150 151 |  | 
| 151 | 
            -
                     | 
| 152 | 
            +
                    token_index = index_for_tokens(tokens, code, type, slack)
         | 
| 153 | 
            +
             | 
| 154 | 
            +
                    tmp_index = merge(tmp_index, token_index) unless tokens.empty?
         | 
| 155 | 
            +
             | 
| 152 156 | 
             
                    items_in_chunk += 1
         | 
| 153 157 |  | 
| 154 158 | 
             
                    if items_in_chunk > chunk_size
         | 
| @@ -267,22 +271,22 @@ class TokenTrieNER < NER | |
| 267 271 | 
             
                  TokenTrieNER.merge(@index, new.index)
         | 
| 268 272 | 
             
                when TSV === new
         | 
| 269 273 | 
             
                  Log.debug "TokenTrieNER merging TSV"
         | 
| 270 | 
            -
                   | 
| 271 | 
            -
             | 
| 272 | 
            -
             | 
| 273 | 
            -
             | 
| 274 | 
            -
                   | 
| 275 | 
            -
                  new.unnamed = old_unnamed
         | 
| 276 | 
            -
                  new.monitor = old_monitor
         | 
| 274 | 
            +
                  new.with_unnamed do
         | 
| 275 | 
            +
                    new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
         | 
| 276 | 
            +
                      TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
         | 
| 277 | 
            +
                    end
         | 
| 278 | 
            +
                  end
         | 
| 277 279 | 
             
                when Hash === new
         | 
| 278 280 | 
             
                  Log.debug "TokenTrieNER merging Hash"
         | 
| 279 281 | 
             
                  TokenTrieNER.merge(@index, new)
         | 
| 280 282 | 
             
                when String === new
         | 
| 281 283 | 
             
                  Log.debug "TokenTrieNER merging file: #{ new }"
         | 
| 282 284 | 
             
                  new = TSV.open(new, :flat)
         | 
| 283 | 
            -
                  new. | 
| 284 | 
            -
             | 
| 285 | 
            -
             | 
| 285 | 
            +
                  new.with_unnamed do
         | 
| 286 | 
            +
                    new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
         | 
| 287 | 
            +
                      TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
         | 
| 288 | 
            +
                    end
         | 
| 289 | 
            +
                  end
         | 
| 286 290 | 
             
                end
         | 
| 287 291 | 
             
              end
         | 
| 288 292 |  | 
| @@ -2,7 +2,19 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.r | |
| 2 2 | 
             
            require 'rbbt/corpus/document'
         | 
| 3 3 | 
             
            require 'test/unit'
         | 
| 4 4 |  | 
| 5 | 
            +
            module TokenEntity
         | 
| 6 | 
            +
              extend Annotation
         | 
| 7 | 
            +
              include Segment
         | 
| 8 | 
            +
              self.annotation :original
         | 
| 9 | 
            +
            end
         | 
| 5 10 | 
             
            class Document
         | 
| 11 | 
            +
             | 
| 12 | 
            +
              def tokenize(text)
         | 
| 13 | 
            +
                Token.tokenize(text).collect do |token|
         | 
| 14 | 
            +
                  TokenEntity.setup(token.dup, token.offset, token.original)
         | 
| 15 | 
            +
                end
         | 
| 16 | 
            +
              end
         | 
| 17 | 
            +
             | 
| 6 18 | 
             
              define :sentences do 
         | 
| 7 19 | 
             
                require 'rbbt/nlp/nlp'
         | 
| 8 20 | 
             
                NLP.geniass_sentence_splitter(text)
         | 
| @@ -10,22 +22,22 @@ class Document | |
| 10 22 |  | 
| 11 23 | 
             
              define :tokens do
         | 
| 12 24 | 
             
                require 'rbbt/ner/segment/token'
         | 
| 13 | 
            -
                 | 
| 25 | 
            +
                tokenize(text)
         | 
| 14 26 | 
             
              end
         | 
| 15 27 |  | 
| 16 28 | 
             
              define :long_words do
         | 
| 17 29 | 
             
                require 'rbbt/ner/segment/token'
         | 
| 18 | 
            -
                 | 
| 30 | 
            +
                tokenize(text).select{|tok| tok.length > 5}
         | 
| 19 31 | 
             
              end
         | 
| 20 32 |  | 
| 21 33 | 
             
              define :short_words do
         | 
| 22 34 | 
             
                require 'rbbt/ner/segment/token'
         | 
| 23 | 
            -
                 | 
| 35 | 
            +
                tokenize(text).select{|tok| tok.length < 5}
         | 
| 24 36 | 
             
              end
         | 
| 25 37 |  | 
| 26 38 | 
             
              define :even_words do
         | 
| 27 39 | 
             
                require 'rbbt/ner/segment/token'
         | 
| 28 | 
            -
                 | 
| 40 | 
            +
                tokenize(text).select{|tok| tok.length % 2 == 0}
         | 
| 29 41 | 
             
              end
         | 
| 30 42 |  | 
| 31 43 | 
             
              define :missing do
         | 
| @@ -110,7 +122,7 @@ another sentence. | |
| 110 122 | 
             
                  doc = Document.new(dir)
         | 
| 111 123 | 
             
                  doc.text = text
         | 
| 112 124 |  | 
| 113 | 
            -
                  sentence = doc.sentences.last
         | 
| 125 | 
            +
                  sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
         | 
| 114 126 | 
             
                  doc.load_into sentence, :tokens 
         | 
| 115 127 |  | 
| 116 128 | 
             
                  assert_equal 5, sentence.tokens.length
         | 
| @@ -134,7 +146,7 @@ another sentence. | |
| 134 146 | 
             
                  doc = Document.new(dir)
         | 
| 135 147 | 
             
                  doc.text = text
         | 
| 136 148 |  | 
| 137 | 
            -
                  sentence = doc.sentences.last
         | 
| 149 | 
            +
                  sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
         | 
| 138 150 | 
             
                  Misc.benchmark(1) do
         | 
| 139 151 | 
             
                    doc = Document.new(dir)
         | 
| 140 152 | 
             
                    doc.text = text
         | 
| @@ -166,7 +178,7 @@ another sentence. | |
| 166 178 | 
             
                  doc = Document.new(dir)
         | 
| 167 179 | 
             
                  doc.text = text * 10
         | 
| 168 180 |  | 
| 169 | 
            -
                  sentence = doc.sentences.last
         | 
| 181 | 
            +
                  sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
         | 
| 170 182 |  | 
| 171 183 | 
             
                  doc.load_into sentence, :tokens, :long_words
         | 
| 172 184 |  | 
| @@ -178,9 +190,9 @@ another sentence. | |
| 178 190 | 
             
                  doc = Document.new(dir)
         | 
| 179 191 | 
             
                  doc.text = text * 10
         | 
| 180 192 | 
             
                  doc.sentences
         | 
| 181 | 
            -
                  assert_equal sentence, doc.sentences.last
         | 
| 193 | 
            +
                  assert_equal sentence, doc.sentences.sort_by{|sentence| sentence.offset}.last
         | 
| 182 194 |  | 
| 183 | 
            -
                  sentence = doc.sentences.last
         | 
| 195 | 
            +
                  sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
         | 
| 184 196 | 
             
                  doc.load_into sentence, :tokens, :long_words
         | 
| 185 197 |  | 
| 186 198 | 
             
                  assert_equal 2, sentence.long_words.length
         | 
| @@ -211,7 +223,7 @@ another sentence. | |
| 211 223 | 
             
                  doc.text = text * 10
         | 
| 212 224 | 
             
                  doc.docid = "TEST"
         | 
| 213 225 |  | 
| 214 | 
            -
                  sentence = doc.sentences.last
         | 
| 226 | 
            +
                  sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
         | 
| 215 227 |  | 
| 216 228 | 
             
                  doc.load_into sentence, :tokens, :long_words, :short_words, :even_words
         | 
| 217 229 |  | 
| @@ -1,9 +1,11 @@ | |
| 1 1 | 
             
            require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
         | 
| 2 2 | 
             
            require 'rbbt/ner/segment/transformed'
         | 
| 3 3 | 
             
            require 'rbbt/ner/segment/named_entity'
         | 
| 4 | 
            +
            require 'rexml/document'
         | 
| 5 | 
            +
            require 'rand'
         | 
| 4 6 |  | 
| 5 7 | 
             
            class TestClass < Test::Unit::TestCase
         | 
| 6 | 
            -
              def  | 
| 8 | 
            +
              def tttest_transform
         | 
| 7 9 | 
             
                a = "This sentence mentions the TP53 gene and the CDK5 protein"
         | 
| 8 10 | 
             
                original = a.dup
         | 
| 9 11 |  | 
| @@ -56,11 +58,13 @@ class TestClass < Test::Unit::TestCase | |
| 56 58 | 
             
                Transformed.with_transform(a, [gene1], "GN") do 
         | 
| 57 59 | 
             
                  assert_equal original.sub("TP53", 'GN'), a
         | 
| 58 60 | 
             
                end
         | 
| 61 | 
            +
             | 
| 59 62 | 
             
                assert_equal original, a
         | 
| 60 63 |  | 
| 61 | 
            -
                Transformed.with_transform(a, [gene1,gene2], "GN") do 
         | 
| 64 | 
            +
                Transformed.with_transform(a, [gene1, gene2], "GN") do 
         | 
| 62 65 | 
             
                  assert_equal original.gsub(/TP53|CDK5R1/, 'GN'), a
         | 
| 63 66 | 
             
                end
         | 
| 67 | 
            +
             | 
| 64 68 | 
             
                assert_equal original, a
         | 
| 65 69 |  | 
| 66 70 | 
             
                Transformed.with_transform(a, [gene1], "GN") do 
         | 
| @@ -69,6 +73,7 @@ class TestClass < Test::Unit::TestCase | |
| 69 73 | 
             
                  end
         | 
| 70 74 | 
             
                  assert_equal original.gsub(/TP53/, 'GN'), a
         | 
| 71 75 | 
             
                end
         | 
| 76 | 
            +
             | 
| 72 77 | 
             
                assert_equal original, a
         | 
| 73 78 |  | 
| 74 79 | 
             
                exp1, exp2 = nil, nil
         | 
| @@ -169,7 +174,37 @@ class TestClass < Test::Unit::TestCase | |
| 169 174 | 
             
                    assert_equal one, a
         | 
| 170 175 | 
             
                  end
         | 
| 171 176 | 
             
                end
         | 
| 172 | 
            -
             | 
| 173 177 | 
             
              end
         | 
| 178 | 
            +
             | 
| 179 | 
            +
              def test_error
         | 
| 180 | 
            +
                a = "Do not have a diagnosis of another hereditary APC resistance/Factor V Leiden, Protein S or C deficiency, prothrombin gene mutation (G20210A), or acquired (lupus anticoagulant) thrombophilic disorder"
         | 
| 181 | 
            +
             | 
| 182 | 
            +
                entity1 = "gene"
         | 
| 183 | 
            +
                entity1.extend NamedEntity
         | 
| 184 | 
            +
                entity1.offset = a.index entity1
         | 
| 185 | 
            +
                entity1.type = "Gene"
         | 
| 186 | 
            +
             | 
| 187 | 
            +
                entity2 = "prothrombin gene mutation"
         | 
| 188 | 
            +
                entity2.extend NamedEntity
         | 
| 189 | 
            +
                entity2.offset = a.index entity2
         | 
| 190 | 
            +
                entity2.type = "Mutation"
         | 
| 191 | 
            +
             | 
| 192 | 
            +
                entity3 = "Protein S or C"
         | 
| 193 | 
            +
                entity3.extend NamedEntity
         | 
| 194 | 
            +
                entity3.offset = a.index entity3
         | 
| 195 | 
            +
                entity3.type = "Gene"
         | 
| 196 | 
            +
             | 
| 197 | 
            +
                entity4 = "prothrombin gene mutation"
         | 
| 198 | 
            +
                entity4.extend NamedEntity
         | 
| 199 | 
            +
                entity4.offset = a.index entity2
         | 
| 200 | 
            +
                entity4.type = "Disease"
         | 
| 201 | 
            +
             | 
| 202 | 
            +
             | 
| 203 | 
            +
                Transformed.with_transform(a, [entity1].sort_by{rand}, Proc.new{|e| e.html}) do 
         | 
| 204 | 
            +
                  Transformed.with_transform(a, [entity3, entity2, entity4].sort_by{rand}, Proc.new{|e| e.html}) do 
         | 
| 205 | 
            +
                    assert_nothing_raised{REXML::Document.new "<xml>"+ a + "</xml>"}
         | 
| 206 | 
            +
                  end
         | 
| 207 | 
            +
                end
         | 
| 208 | 
            +
               end
         | 
| 174 209 | 
             
            end
         | 
| 175 210 |  | 
    
        metadata
    CHANGED
    
    | @@ -1,13 +1,13 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification 
         | 
| 2 2 | 
             
            name: rbbt-text
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version 
         | 
| 4 | 
            -
              hash:  | 
| 4 | 
            +
              hash: 1
         | 
| 5 5 | 
             
              prerelease: 
         | 
| 6 6 | 
             
              segments: 
         | 
| 7 7 | 
             
              - 0
         | 
| 8 8 | 
             
              - 6
         | 
| 9 | 
            -
              -  | 
| 10 | 
            -
              version: 0.6. | 
| 9 | 
            +
              - 3
         | 
| 10 | 
            +
              version: 0.6.3
         | 
| 11 11 | 
             
            platform: ruby
         | 
| 12 12 | 
             
            authors: 
         | 
| 13 13 | 
             
            - Miguel Vazquez
         | 
| @@ -15,8 +15,7 @@ autorequire: | |
| 15 15 | 
             
            bindir: bin
         | 
| 16 16 | 
             
            cert_chain: []
         | 
| 17 17 |  | 
| 18 | 
            -
            date:  | 
| 19 | 
            -
            default_executable: get_ppis.rb
         | 
| 18 | 
            +
            date: 2012-02-09 00:00:00 Z
         | 
| 20 19 | 
             
            dependencies: 
         | 
| 21 20 | 
             
            - !ruby/object:Gem::Dependency 
         | 
| 22 21 | 
             
              name: rbbt-util
         | 
| @@ -106,6 +105,7 @@ files: | |
| 106 105 | 
             
            - lib/rbbt/corpus/document.rb
         | 
| 107 106 | 
             
            - lib/rbbt/corpus/document_repo.rb
         | 
| 108 107 | 
             
            - lib/rbbt/corpus/sources/pubmed.rb
         | 
| 108 | 
            +
            - lib/rbbt/entity/document.rb
         | 
| 109 109 | 
             
            - lib/rbbt/ner/NER.rb
         | 
| 110 110 | 
             
            - lib/rbbt/ner/abner.rb
         | 
| 111 111 | 
             
            - lib/rbbt/ner/banner.rb
         | 
| @@ -161,7 +161,6 @@ files: | |
| 161 161 | 
             
            - test/rbbt/corpus/test_corpus.rb
         | 
| 162 162 | 
             
            - test/rbbt/corpus/test_document.rb
         | 
| 163 163 | 
             
            - bin/get_ppis.rb
         | 
| 164 | 
            -
            has_rdoc: true
         | 
| 165 164 | 
             
            homepage: http://github.com/mikisvaz/rbbt-util
         | 
| 166 165 | 
             
            licenses: []
         | 
| 167 166 |  | 
| @@ -191,7 +190,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement | |
| 191 190 | 
             
            requirements: []
         | 
| 192 191 |  | 
| 193 192 | 
             
            rubyforge_project: 
         | 
| 194 | 
            -
            rubygems_version: 1. | 
| 193 | 
            +
            rubygems_version: 1.8.10
         | 
| 195 194 | 
             
            signing_key: 
         | 
| 196 195 | 
             
            specification_version: 3
         | 
| 197 196 | 
             
            summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
         |