text_sentencer 1.0.2 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 7008e00d8e1554e27e82bbf744eb74bf10704234
4
- data.tar.gz: e863a486fe7f932c6354abd314c6e9690f4b1ece
2
+ SHA256:
3
+ metadata.gz: 13537fd663911db81cc5d4068c4c3d8439d332c4742e9d027c6687031f44f32f
4
+ data.tar.gz: dbd6aacd4e7dfca434c27608fa747bb7fbd8d43bf62af682f0e55784fcccf748
5
5
  SHA512:
6
- metadata.gz: a2762e82db9eece26fd97ec7ac3c44f13ad3fe9babe2910238d1ed5e4a616d3afbaac3ff421d6dda354d3ceb708ca0a5ce7877563f253bf726599f898f0a5f44
7
- data.tar.gz: 9ee5c91045041905f61218944388157fdc7ffb53b6d7f8bd784ef8a286aa2acdf6cca524fb31f4a1e0725466bdbe130de55ef4b00535deb7bf5ae1b65c6ed49b
6
+ metadata.gz: 9758707a8204ebe212e699f2f3dd2011d3ad0fdf355cde4c85df5521d17aa6763df593915b88890aad9c29c10903bbada40be9ec66cc082305fdf5b614c252bb
7
+ data.tar.gz: '09938a12984f7d33c6337adcaf59af9087ce2969d5d98b37b3e73d833d015a49950b6d7b65bc845dc0820727ea4d2b5af61a8bb0af7ce5201b7531748f8772a8'
data/bin/text_sentencer CHANGED
@@ -8,20 +8,20 @@ output_mode = :sentences
8
8
  ## command line option processing
9
9
  require 'optparse'
10
10
  optparse = OptionParser.new do |opts|
11
- opts.banner = "Usage: text_sentencer [options]"
12
-
13
- opts.on('-c', '--config=config_filename', 'specifies the configuration JSON file.') do |c|
14
- config_filename = c
15
- end
16
-
17
- opts.on('-j', '--json_output', 'outputs the result in JSON.') do
18
- output_mode = :json
19
- end
20
-
21
- opts.on('-h', '--help', 'displays this screen.') do
22
- puts opts
23
- exit
24
- end
11
+ opts.banner = "Usage: text_sentencer [options]"
12
+
13
+ opts.on('-c', '--config=config_filename', 'specifies the configuration JSON file.') do |c|
14
+ config_filename = c
15
+ end
16
+
17
+ opts.on('-j', '--json_output', 'outputs the result in JSON.') do
18
+ output_mode = :json
19
+ end
20
+
21
+ opts.on('-h', '--help', 'displays this screen.') do
22
+ puts opts
23
+ exit
24
+ end
25
25
  end
26
26
 
27
27
  optparse.parse!
@@ -36,10 +36,12 @@ text = ARGF.read
36
36
  annotations = sentencer.annotate(text)
37
37
 
38
38
  if output_mode == :json
39
- puts JSON.pretty_generate(annotations)
39
+ puts JSON.pretty_generate(annotations)
40
40
  else
41
- annotations[:denotations].each do |d|
42
- span = d[:span]
43
- puts text[span[:begin]...span[:end]]
44
- end
41
+ if annotations.has_key?(:blocks)
42
+ annotations[:blocks].each do |d|
43
+ span = d[:span]
44
+ puts '[' + text[span[:begin]...span[:end]] + ']'
45
+ end
46
+ end
45
47
  end
@@ -1,9 +1,9 @@
1
1
  class String
2
- def scan_offset(regex)
3
- Enumerator.new do |y|
4
- self.scan(regex) do
5
- y << Regexp.last_match
6
- end
7
- end
8
- end
2
+ def scan_offset(regex)
3
+ Enumerator.new do |y|
4
+ self.scan(regex) do
5
+ y << Regexp.last_match
6
+ end
7
+ end
8
+ end
9
9
  end
@@ -3,115 +3,105 @@ require 'text_sentencer/string_scan_offset'
3
3
  require 'pp'
4
4
 
5
5
  class TextSentencer
6
- ## default rules
7
-
8
- DEFAULT_RULES = {
9
- # All the positions of new line characters always take sentence break.
10
- break_pattern: "([ \t]*\n+)+[ \t]*",
11
-
12
- # All the positions of space and tab characters are candiates of sentence break.
13
- candidate_pattern: "[ \t]+",
14
-
15
- # First, positive rules are applied to the break candidates to make initial segmantations.
16
- positive_rules: [
17
- ['[.!?]', '[0-9A-Z]'],
18
- ['[:]', '[0-9]'],
19
- ['[:]', '[A-Z][a-z]']
20
- ],
21
-
22
- # Then, negative rules are applied to cancel some initial segmentations.
23
- negative_rules: [
24
- # Titles before names
25
- ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
26
-
27
- # Titles usually before names, but ..
28
- ['(Sr|Jr)\.', '[A-Z][a-z]'],
29
-
30
- # Single letter abbriveations, e.g. middle name
31
- # ['\b[A-Z]\.', '[A-Z][a-z]'],
32
-
33
- # Abbriveations, e.g. middle name
34
- ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
35
-
36
- # Frequent abbreviations that will never appear in the end of a sentence
37
- ['(cf|vs)\.', ''],
38
- ['e\.g\.', ''],
39
- ['i\.e\.', ''],
40
-
41
- # Others
42
- ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
43
- ]
44
- }
45
-
46
- def initialize(rules = nil)
47
- rules ||= DEFAULT_RULES
48
- @rules = Hash[rules.map{|(k,v)| [k.to_sym,v]}]
49
- @rules[:break_pattern] ||= ""
50
- @rules[:candidate_pattern] ||= ""
51
- @rules[:positive_rules] ||= []
52
- @rules[:negative_rules] ||= []
53
- end
54
-
55
- def annotate(text)
56
- return nil if text.nil?
57
-
58
- sentences = segment(text)
59
- denotations = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
60
- {text:text, denotations:denotations}
61
- end
62
-
63
- def segment(text)
64
- breaks = if @rules[:break_pattern].empty?
65
- []
66
- else
67
- text.scan_offset(/#{@rules[:break_pattern]}/).map{|m| m.offset(0)}
68
- end
69
-
70
- candidates = if @rules[:candidate_pattern].empty?
71
- []
72
- else
73
- text.scan_offset(/#{@rules[:candidate_pattern]}/).map{|m| m.offset(0)}
74
- end
75
-
76
- # breaks take precedent
77
- candidates -= breaks
78
-
79
- candidates.each do |c|
80
- last_end, next_begin = c
81
-
82
- if (last_end == 0) || (next_begin == text.length)
83
- breaks << c
84
- next
85
- end
86
-
87
- last_text = text[0...last_end]
88
- next_text = text[next_begin..-1]
89
-
90
- @rules[:positive_rules].each do |p|
91
- if (last_text =~ /#{p[0]}\Z/) && (next_text =~ /\A#{p[1]}/)
92
- break_p = true
93
- @rules[:negative_rules].each do |n|
94
- if (last_text =~ /#{n[0]}\Z/) && (next_text =~ /\A#{n[1]}/)
95
- break_p = false
96
- break
97
- end
98
- end
99
- breaks << c if break_p
100
- break
101
- end
102
- end
103
- end
104
-
105
- breaks.sort!
106
-
107
- sentences = []
108
- lastbreak = 0
109
- breaks.each do |b|
110
- sentences << [lastbreak, b[0]] if b[0] > lastbreak
111
- lastbreak = b[1]
112
- end
113
- sentences << [lastbreak, text.length] if lastbreak < text.length
114
-
115
- sentences
116
- end
6
+ ## default rules
7
+
8
+ DEFAULT_RULES = {
9
+ # All the positions of new line characters always take sentence break.
10
+ break_pattern: /([ \t]*\n+)+[ \t]*/,
11
+
12
+ # All the positions of space and tab characters are candiates of sentence break.
13
+ candidate_pattern: /[ \t\n]+/,
14
+
15
+ # First, positive rules are applied to the break candidates to make initial segmantations.
16
+ positive_rules: [
17
+ [/[.!?]\z/, /\A[0-9A-Z]/],
18
+ [/[:]\z/, /\A[0-9]/],
19
+ [/[:]\z/, /\A[A-Z][a-z]/]
20
+ ],
21
+
22
+ # Then, negative rules are applied to cancel some initial segmentations.
23
+ negative_rules: [
24
+ # Titles before names
25
+ [/(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.\z/, /\A[A-Z][a-z]/],
26
+
27
+ # Titles usually before names, but ..
28
+ [/(Sr|Jr)\.\z/, /\A[A-Z][a-z]/],
29
+
30
+ # Single letter abbriveations, e.g. middle name
31
+ # [/\b[A-Z]\.\z/, /\A[A-Z][a-z]/],
32
+
33
+ # Abbriveations, e.g. middle name
34
+ [/\b[A-Z][a-z]*\.\z/, /\A[0-9A-Z]/],
35
+
36
+ # Frequent abbreviations that will never appear in the end of a sentence
37
+ [/(cf|vs)\.\z/, //],
38
+ [/e\.g\.\z/, //],
39
+ [/i\.e\.\z/, //],
40
+
41
+ # Others
42
+ [/(Sec|Chap|Fig|Eq)\.\z/, /\A[0-9A-Z]/]
43
+ ]
44
+ }
45
+
46
+ def initialize(rules = nil)
47
+ rules ||= DEFAULT_RULES
48
+ @rules = Hash[rules.map{|(k,v)| [k.to_sym,v]}]
49
+ @rules[:break_pattern] ||= nil
50
+ @rules[:candidate_pattern] ||= nil
51
+ @rules[:positive_rules] ||= []
52
+ @rules[:negative_rules] ||= []
53
+ end
54
+
55
+ def annotate(text)
56
+ sentences = segment(text)
57
+ blocks = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
58
+ {text:text, blocks:blocks}
59
+ end
60
+
61
+ def segment(text)
62
+ # hard sentence breaks
63
+ breaks = if @rules[:break_pattern].nil?
64
+ []
65
+ else
66
+ text.scan_offset(@rules[:break_pattern]).map{|m| m.offset(0)}
67
+ end
68
+
69
+ # candidates of sentence breaks
70
+ candidates = if @rules[:candidate_pattern].nil?
71
+ []
72
+ else
73
+ text.scan_offset(@rules[:candidate_pattern]).map{|m| m.offset(0)}
74
+ end
75
+
76
+ # hard sentence breaks are already in
77
+ candidates -= breaks
78
+
79
+ # filter using the positive rules. The first and last breaks remain.
80
+ candidates.select! do |c|
81
+ c[0] == 0 ||
82
+ c[1] == text.length ||
83
+ @rules[:positive_rules].any? do |r|
84
+ (text[0...c[0]] =~ r[0]) && (text[c[1]...-1] =~ r[1])
85
+ end
86
+ end
87
+
88
+ # filter using the negative rules
89
+ candidates.reject! do |c|
90
+ @rules[:negative_rules].any? do |r|
91
+ (text[0...c[0]] =~ r[0]) && (text[c[1]...-1] =~ r[1])
92
+ end
93
+ end
94
+
95
+ # add all the filtered candidates to sentence breaks
96
+ breaks += candidates
97
+ breaks.uniq!
98
+ breaks.sort!
99
+
100
+ # add the initial and final breaks unless already exist
101
+ breaks.unshift([0, 0]) if breaks.empty? || breaks[0][0] != 0
102
+ breaks.push([text.length, text.length]) unless breaks[-1][1] == text.length
103
+
104
+ # conver the breaks into sentences
105
+ sentences = (1 ... breaks.length).map{|i| [breaks[i-1][1], breaks[i][0]]}
106
+ end
117
107
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_sentencer
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 3.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-09-07 00:00:00.000000000 Z
11
+ date: 2024-08-03 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: TextSentencer is a simple rule-based system for segmenting text into
14
14
  sentences.
@@ -41,8 +41,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
41
41
  - !ruby/object:Gem::Version
42
42
  version: '0'
43
43
  requirements: []
44
- rubyforge_project:
45
- rubygems_version: 2.4.8
44
+ rubygems_version: 3.5.11
46
45
  signing_key:
47
46
  specification_version: 4
48
47
  summary: A simple, rule-based script to find sentence boundaries in text.