text_sentencer 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/bin/text_sentencer +19 -19
- data/lib/text_sentencer/string_scan_offset.rb +7 -7
- data/lib/text_sentencer/text_sentencer.rb +111 -111
- metadata +3 -4
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 | 
            -
             | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 2 | 
            +
            SHA256:
         | 
| 3 | 
            +
              metadata.gz: b8dda6323a11356aa2f6c78fa0281658ea677f09fbf0bfb19b19330f732e72c1
         | 
| 4 | 
            +
              data.tar.gz: a41e5cc5ae3e1e294e9ba4921993e2d98003d74435fd5bc099c6c6fbf188554a
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: b39ec569c1e988f4e7936385395ff1e34c15cb7721ffba6fb2453f633db360f092b5225d6ec4603b2beb391e2660291e9d48e912359eb33db9d2ccd1906d806f
         | 
| 7 | 
            +
              data.tar.gz: 2780a4c3f6ac1fc7f0ab06e658a293b7e9a7d3f5a2df0a0904fa20aea187857c9f3bad7cc7adfb6f1eba4c57897f433c03d77a958559e09225b60c154c9488b3
         | 
    
        data/bin/text_sentencer
    CHANGED
    
    | @@ -8,20 +8,20 @@ output_mode = :sentences | |
| 8 8 | 
             
            ## command line option processing
         | 
| 9 9 | 
             
            require 'optparse'
         | 
| 10 10 | 
             
            optparse = OptionParser.new do |opts|
         | 
| 11 | 
            -
             | 
| 12 | 
            -
             | 
| 13 | 
            -
             | 
| 14 | 
            -
             | 
| 15 | 
            -
             | 
| 16 | 
            -
             | 
| 17 | 
            -
             | 
| 18 | 
            -
             | 
| 19 | 
            -
             | 
| 20 | 
            -
             | 
| 21 | 
            -
             | 
| 22 | 
            -
             | 
| 23 | 
            -
             | 
| 24 | 
            -
             | 
| 11 | 
            +
            	opts.banner = "Usage: text_sentencer [options]"
         | 
| 12 | 
            +
             | 
| 13 | 
            +
            	opts.on('-c', '--config=config_filename', 'specifies the configuration JSON file.') do |c|
         | 
| 14 | 
            +
            		config_filename = c
         | 
| 15 | 
            +
            	end
         | 
| 16 | 
            +
            		
         | 
| 17 | 
            +
            	opts.on('-j', '--json_output', 'outputs the result in JSON.') do
         | 
| 18 | 
            +
            		output_mode = :json
         | 
| 19 | 
            +
            	end
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            	opts.on('-h', '--help', 'displays this screen.') do
         | 
| 22 | 
            +
            		puts opts
         | 
| 23 | 
            +
            		exit
         | 
| 24 | 
            +
            	end
         | 
| 25 25 | 
             
            end
         | 
| 26 26 |  | 
| 27 27 | 
             
            optparse.parse!
         | 
| @@ -36,10 +36,10 @@ text = ARGF.read | |
| 36 36 | 
             
            annotations = sentencer.annotate(text)
         | 
| 37 37 |  | 
| 38 38 | 
             
            if output_mode == :json
         | 
| 39 | 
            -
             | 
| 39 | 
            +
            	puts JSON.pretty_generate(annotations)
         | 
| 40 40 | 
             
            else
         | 
| 41 | 
            -
             | 
| 42 | 
            -
             | 
| 43 | 
            -
             | 
| 44 | 
            -
             | 
| 41 | 
            +
            	annotations[:denotations].each do |d|
         | 
| 42 | 
            +
            		span = d[:span]
         | 
| 43 | 
            +
            		puts text[span[:begin]...span[:end]]
         | 
| 44 | 
            +
            	end
         | 
| 45 45 | 
             
            end
         | 
| @@ -3,115 +3,115 @@ require 'text_sentencer/string_scan_offset' | |
| 3 3 | 
             
            require 'pp'
         | 
| 4 4 |  | 
| 5 5 | 
             
            class TextSentencer
         | 
| 6 | 
            -
             | 
| 7 | 
            -
             | 
| 8 | 
            -
             | 
| 9 | 
            -
             | 
| 10 | 
            -
             | 
| 11 | 
            -
             | 
| 12 | 
            -
             | 
| 13 | 
            -
             | 
| 14 | 
            -
             | 
| 15 | 
            -
             | 
| 16 | 
            -
             | 
| 17 | 
            -
             | 
| 18 | 
            -
             | 
| 19 | 
            -
             | 
| 20 | 
            -
             | 
| 21 | 
            -
             | 
| 22 | 
            -
             | 
| 23 | 
            -
             | 
| 24 | 
            -
             | 
| 25 | 
            -
             | 
| 26 | 
            -
             | 
| 27 | 
            -
             | 
| 28 | 
            -
             | 
| 29 | 
            -
             | 
| 30 | 
            -
             | 
| 31 | 
            -
             | 
| 32 | 
            -
             | 
| 33 | 
            -
             | 
| 34 | 
            -
             | 
| 35 | 
            -
             | 
| 36 | 
            -
             | 
| 37 | 
            -
             | 
| 38 | 
            -
             | 
| 39 | 
            -
             | 
| 40 | 
            -
             | 
| 41 | 
            -
             | 
| 42 | 
            -
             | 
| 43 | 
            -
             | 
| 44 | 
            -
             | 
| 45 | 
            -
             | 
| 46 | 
            -
             | 
| 47 | 
            -
             | 
| 48 | 
            -
             | 
| 49 | 
            -
             | 
| 50 | 
            -
             | 
| 51 | 
            -
             | 
| 52 | 
            -
             | 
| 53 | 
            -
             | 
| 54 | 
            -
             | 
| 55 | 
            -
             | 
| 56 | 
            -
             | 
| 57 | 
            -
             | 
| 58 | 
            -
             | 
| 59 | 
            -
             | 
| 60 | 
            -
             | 
| 61 | 
            -
             | 
| 62 | 
            -
             | 
| 63 | 
            -
             | 
| 64 | 
            -
             | 
| 65 | 
            -
             | 
| 66 | 
            -
             | 
| 67 | 
            -
             | 
| 68 | 
            -
             | 
| 69 | 
            -
             | 
| 70 | 
            -
             | 
| 71 | 
            -
             | 
| 72 | 
            -
             | 
| 73 | 
            -
             | 
| 74 | 
            -
             | 
| 75 | 
            -
             | 
| 76 | 
            -
             | 
| 77 | 
            -
             | 
| 78 | 
            -
             | 
| 79 | 
            -
             | 
| 80 | 
            -
             | 
| 81 | 
            -
             | 
| 82 | 
            -
             | 
| 83 | 
            -
             | 
| 84 | 
            -
             | 
| 85 | 
            -
             | 
| 86 | 
            -
             | 
| 87 | 
            -
             | 
| 88 | 
            -
             | 
| 89 | 
            -
             | 
| 90 | 
            -
             | 
| 91 | 
            -
             | 
| 92 | 
            -
             | 
| 93 | 
            -
             | 
| 94 | 
            -
             | 
| 95 | 
            -
             | 
| 96 | 
            -
             | 
| 97 | 
            -
             | 
| 98 | 
            -
             | 
| 99 | 
            -
             | 
| 100 | 
            -
             | 
| 101 | 
            -
             | 
| 102 | 
            -
             | 
| 103 | 
            -
             | 
| 104 | 
            -
             | 
| 105 | 
            -
             | 
| 106 | 
            -
             | 
| 107 | 
            -
             | 
| 108 | 
            -
             | 
| 109 | 
            -
             | 
| 110 | 
            -
             | 
| 111 | 
            -
             | 
| 112 | 
            -
             | 
| 113 | 
            -
             | 
| 114 | 
            -
             | 
| 115 | 
            -
             | 
| 116 | 
            -
             | 
| 6 | 
            +
            	## default rules
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            	DEFAULT_RULES = {
         | 
| 9 | 
            +
            		# Every run of newline characters always causes a sentence break.
         | 
| 10 | 
            +
            		break_pattern: "([ \t]*\n+)+[ \t]*",
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            		# All the positions of space and tab characters are candidates for a sentence break.
         | 
| 13 | 
            +
            		candidate_pattern: "[ \t]+",
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            		# First, positive rules are applied to the break candidates to make initial segmentations.
         | 
| 16 | 
            +
            		positive_rules: [
         | 
| 17 | 
            +
            			['[.!?]', '[0-9A-Z]'],
         | 
| 18 | 
            +
            			['[:]', '[0-9]'],
         | 
| 19 | 
            +
            			['[:]', '[A-Z][a-z]']
         | 
| 20 | 
            +
            		],
         | 
| 21 | 
            +
             | 
| 22 | 
            +
            		# Then, negative rules are applied to cancel some initial segmentations.
         | 
| 23 | 
            +
            		negative_rules: [
         | 
| 24 | 
            +
            			# Titles before names
         | 
| 25 | 
            +
            			['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
         | 
| 26 | 
            +
             | 
| 27 | 
            +
            			# Titles usually before names, but ..
         | 
| 28 | 
            +
            			['(Sr|Jr)\.', '[A-Z][a-z]'],
         | 
| 29 | 
            +
             | 
| 30 | 
            +
            			# Single letter abbreviations, e.g. middle name
         | 
| 31 | 
            +
            			# ['\b[A-Z]\.', '[A-Z][a-z]'],
         | 
| 32 | 
            +
             | 
| 33 | 
            +
            			# Abbreviations, e.g. middle name
         | 
| 34 | 
            +
            			['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
         | 
| 35 | 
            +
             | 
| 36 | 
            +
            			# Frequent abbreviations that will never appear in the end of a sentence
         | 
| 37 | 
            +
            			['(cf|vs)\.', ''],
         | 
| 38 | 
            +
            			['e\.g\.', ''],
         | 
| 39 | 
            +
            			['i\.e\.', ''],
         | 
| 40 | 
            +
             | 
| 41 | 
            +
            			# Others
         | 
| 42 | 
            +
            			['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
         | 
| 43 | 
            +
            		]
         | 
| 44 | 
            +
            	}
         | 
| 45 | 
            +
             | 
| 46 | 
            +
            	def initialize(rules = nil)
         | 
| 47 | 
            +
            		rules ||= DEFAULT_RULES
         | 
| 48 | 
            +
            		@rules = Hash[rules.map{|(k,v)| [k.to_sym,v]}]
         | 
| 49 | 
            +
            		@rules[:break_pattern] ||= ""
         | 
| 50 | 
            +
            		@rules[:candidate_pattern] ||= ""
         | 
| 51 | 
            +
            		@rules[:positive_rules] ||= []
         | 
| 52 | 
            +
            		@rules[:negative_rules] ||= []
         | 
| 53 | 
            +
            	end
         | 
| 54 | 
            +
             | 
| 55 | 
            +
            	def annotate(text)
         | 
| 56 | 
            +
            		return nil if text.nil?
         | 
| 57 | 
            +
             | 
| 58 | 
            +
            		sentences = segment(text)
         | 
| 59 | 
            +
            		denotations = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
         | 
| 60 | 
            +
            		{text:text, denotations:denotations}
         | 
| 61 | 
            +
            	end
         | 
| 62 | 
            +
             | 
| 63 | 
            +
            	def segment(text)
         | 
| 64 | 
            +
            		breaks = if @rules[:break_pattern].empty?
         | 
| 65 | 
            +
            			[]
         | 
| 66 | 
            +
            		else
         | 
| 67 | 
            +
            			text.scan_offset(/#{@rules[:break_pattern]}/).map{|m| m.offset(0)}
         | 
| 68 | 
            +
            		end
         | 
| 69 | 
            +
             | 
| 70 | 
            +
            		candidates = if @rules[:candidate_pattern].empty?
         | 
| 71 | 
            +
            			[]
         | 
| 72 | 
            +
            		else
         | 
| 73 | 
            +
            			text.scan_offset(/#{@rules[:candidate_pattern]}/).map{|m| m.offset(0)}
         | 
| 74 | 
            +
            		end
         | 
| 75 | 
            +
             | 
| 76 | 
            +
            		# breaks take precedence
         | 
| 77 | 
            +
            		candidates -= breaks
         | 
| 78 | 
            +
             | 
| 79 | 
            +
            		candidates.each do |c|
         | 
| 80 | 
            +
            			last_end, next_begin = c
         | 
| 81 | 
            +
             | 
| 82 | 
            +
            			if (last_end == 0) || (next_begin == text.length)
         | 
| 83 | 
            +
            				breaks << c
         | 
| 84 | 
            +
            				next
         | 
| 85 | 
            +
            			end
         | 
| 86 | 
            +
             | 
| 87 | 
            +
            			last_text = text[0...last_end]
         | 
| 88 | 
            +
            			next_text = text[next_begin..-1]
         | 
| 89 | 
            +
             | 
| 90 | 
            +
            			@rules[:positive_rules].each do |p|
         | 
| 91 | 
            +
            				if (last_text =~ /#{p[0]}\Z/) && (next_text =~ /\A#{p[1]}/)
         | 
| 92 | 
            +
            					break_p = true
         | 
| 93 | 
            +
            					@rules[:negative_rules].each do |n|
         | 
| 94 | 
            +
            						if (last_text =~ /#{n[0]}\Z/) && (next_text =~ /\A#{n[1]}/)
         | 
| 95 | 
            +
            							break_p = false
         | 
| 96 | 
            +
            							break
         | 
| 97 | 
            +
            						end
         | 
| 98 | 
            +
            					end
         | 
| 99 | 
            +
            					breaks << c if break_p
         | 
| 100 | 
            +
            					break
         | 
| 101 | 
            +
            				end
         | 
| 102 | 
            +
            			end
         | 
| 103 | 
            +
            		end
         | 
| 104 | 
            +
             | 
| 105 | 
            +
            		breaks.sort!
         | 
| 106 | 
            +
             | 
| 107 | 
            +
            		sentences = []
         | 
| 108 | 
            +
            		lastbreak = 0
         | 
| 109 | 
            +
            		breaks.each do |b|
         | 
| 110 | 
            +
            			sentences << [lastbreak, b[0]] if b[0] > lastbreak
         | 
| 111 | 
            +
            			lastbreak = b[1]
         | 
| 112 | 
            +
            		end
         | 
| 113 | 
            +
            		sentences << [lastbreak, text.length] if lastbreak < text.length
         | 
| 114 | 
            +
             | 
| 115 | 
            +
            		sentences
         | 
| 116 | 
            +
            	end
         | 
| 117 117 | 
             
            end
         | 
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: text_sentencer
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 1.0.2 | 
| 4 | 
            +
              version: 1.0.3
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Jin-Dong Kim
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date:  | 
| 11 | 
            +
            date: 2021-01-17 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies: []
         | 
| 13 13 | 
             
            description: TextSentencer is a simple rule-based system for segmenting text into
         | 
| 14 14 | 
             
              sentences.
         | 
| @@ -41,8 +41,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement | |
| 41 41 | 
             
                - !ruby/object:Gem::Version
         | 
| 42 42 | 
             
                  version: '0'
         | 
| 43 43 | 
             
            requirements: []
         | 
| 44 | 
            -
             | 
| 45 | 
            -
            rubygems_version: 2.4.8
         | 
| 44 | 
            +
            rubygems_version: 3.2.3
         | 
| 46 45 | 
             
            signing_key: 
         | 
| 47 46 | 
             
            specification_version: 4
         | 
| 48 47 | 
             
            summary: A simple, rule-based script to find sentence boundaries in text.
         |