text_sentencer 1.0.2 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 7008e00d8e1554e27e82bbf744eb74bf10704234
4
- data.tar.gz: e863a486fe7f932c6354abd314c6e9690f4b1ece
2
+ SHA256:
3
+ metadata.gz: 13537fd663911db81cc5d4068c4c3d8439d332c4742e9d027c6687031f44f32f
4
+ data.tar.gz: dbd6aacd4e7dfca434c27608fa747bb7fbd8d43bf62af682f0e55784fcccf748
5
5
  SHA512:
6
- metadata.gz: a2762e82db9eece26fd97ec7ac3c44f13ad3fe9babe2910238d1ed5e4a616d3afbaac3ff421d6dda354d3ceb708ca0a5ce7877563f253bf726599f898f0a5f44
7
- data.tar.gz: 9ee5c91045041905f61218944388157fdc7ffb53b6d7f8bd784ef8a286aa2acdf6cca524fb31f4a1e0725466bdbe130de55ef4b00535deb7bf5ae1b65c6ed49b
6
+ metadata.gz: 9758707a8204ebe212e699f2f3dd2011d3ad0fdf355cde4c85df5521d17aa6763df593915b88890aad9c29c10903bbada40be9ec66cc082305fdf5b614c252bb
7
+ data.tar.gz: '09938a12984f7d33c6337adcaf59af9087ce2969d5d98b37b3e73d833d015a49950b6d7b65bc845dc0820727ea4d2b5af61a8bb0af7ce5201b7531748f8772a8'
data/bin/text_sentencer CHANGED
@@ -8,20 +8,20 @@ output_mode = :sentences
8
8
  ## command line option processing
9
9
  require 'optparse'
10
10
  optparse = OptionParser.new do |opts|
11
- opts.banner = "Usage: text_sentencer [options]"
12
-
13
- opts.on('-c', '--config=config_filename', 'specifies the configuration JSON file.') do |c|
14
- config_filename = c
15
- end
16
-
17
- opts.on('-j', '--json_output', 'outputs the result in JSON.') do
18
- output_mode = :json
19
- end
20
-
21
- opts.on('-h', '--help', 'displays this screen.') do
22
- puts opts
23
- exit
24
- end
11
+ opts.banner = "Usage: text_sentencer [options]"
12
+
13
+ opts.on('-c', '--config=config_filename', 'specifies the configuration JSON file.') do |c|
14
+ config_filename = c
15
+ end
16
+
17
+ opts.on('-j', '--json_output', 'outputs the result in JSON.') do
18
+ output_mode = :json
19
+ end
20
+
21
+ opts.on('-h', '--help', 'displays this screen.') do
22
+ puts opts
23
+ exit
24
+ end
25
25
  end
26
26
 
27
27
  optparse.parse!
@@ -36,10 +36,12 @@ text = ARGF.read
36
36
  annotations = sentencer.annotate(text)
37
37
 
38
38
  if output_mode == :json
39
- puts JSON.pretty_generate(annotations)
39
+ puts JSON.pretty_generate(annotations)
40
40
  else
41
- annotations[:denotations].each do |d|
42
- span = d[:span]
43
- puts text[span[:begin]...span[:end]]
44
- end
41
+ if annotations.has_key?(:blocks)
42
+ annotations[:blocks].each do |d|
43
+ span = d[:span]
44
+ puts '[' + text[span[:begin]...span[:end]] + ']'
45
+ end
46
+ end
45
47
  end
@@ -1,9 +1,9 @@
1
1
  class String
2
- def scan_offset(regex)
3
- Enumerator.new do |y|
4
- self.scan(regex) do
5
- y << Regexp.last_match
6
- end
7
- end
8
- end
2
+ def scan_offset(regex)
3
+ Enumerator.new do |y|
4
+ self.scan(regex) do
5
+ y << Regexp.last_match
6
+ end
7
+ end
8
+ end
9
9
  end
@@ -3,115 +3,105 @@ require 'text_sentencer/string_scan_offset'
3
3
  require 'pp'
4
4
 
5
5
  class TextSentencer
6
- ## default rules
7
-
8
- DEFAULT_RULES = {
9
- # All the positions of new line characters always take sentence break.
10
- break_pattern: "([ \t]*\n+)+[ \t]*",
11
-
12
- # All the positions of space and tab characters are candidates of sentence break.
13
- candidate_pattern: "[ \t]+",
14
-
15
- # First, positive rules are applied to the break candidates to make initial segmentations.
16
- positive_rules: [
17
- ['[.!?]', '[0-9A-Z]'],
18
- ['[:]', '[0-9]'],
19
- ['[:]', '[A-Z][a-z]']
20
- ],
21
-
22
- # Then, negative rules are applied to cancel some initial segmentations.
23
- negative_rules: [
24
- # Titles before names
25
- ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
26
-
27
- # Titles usually before names, but ..
28
- ['(Sr|Jr)\.', '[A-Z][a-z]'],
29
-
30
- # Single letter abbreviations, e.g. middle name
31
- # ['\b[A-Z]\.', '[A-Z][a-z]'],
32
-
33
- # Abbreviations, e.g. middle name
34
- ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
35
-
36
- # Frequent abbreviations that will never appear in the end of a sentence
37
- ['(cf|vs)\.', ''],
38
- ['e\.g\.', ''],
39
- ['i\.e\.', ''],
40
-
41
- # Others
42
- ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
43
- ]
44
- }
45
-
46
- def initialize(rules = nil)
47
- rules ||= DEFAULT_RULES
48
- @rules = Hash[rules.map{|(k,v)| [k.to_sym,v]}]
49
- @rules[:break_pattern] ||= ""
50
- @rules[:candidate_pattern] ||= ""
51
- @rules[:positive_rules] ||= []
52
- @rules[:negative_rules] ||= []
53
- end
54
-
55
- def annotate(text)
56
- return nil if text.nil?
57
-
58
- sentences = segment(text)
59
- denotations = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
60
- {text:text, denotations:denotations}
61
- end
62
-
63
- def segment(text)
64
- breaks = if @rules[:break_pattern].empty?
65
- []
66
- else
67
- text.scan_offset(/#{@rules[:break_pattern]}/).map{|m| m.offset(0)}
68
- end
69
-
70
- candidates = if @rules[:candidate_pattern].empty?
71
- []
72
- else
73
- text.scan_offset(/#{@rules[:candidate_pattern]}/).map{|m| m.offset(0)}
74
- end
75
-
76
- # breaks take precedent
77
- candidates -= breaks
78
-
79
- candidates.each do |c|
80
- last_end, next_begin = c
81
-
82
- if (last_end == 0) || (next_begin == text.length)
83
- breaks << c
84
- next
85
- end
86
-
87
- last_text = text[0...last_end]
88
- next_text = text[next_begin..-1]
89
-
90
- @rules[:positive_rules].each do |p|
91
- if (last_text =~ /#{p[0]}\Z/) && (next_text =~ /\A#{p[1]}/)
92
- break_p = true
93
- @rules[:negative_rules].each do |n|
94
- if (last_text =~ /#{n[0]}\Z/) && (next_text =~ /\A#{n[1]}/)
95
- break_p = false
96
- break
97
- end
98
- end
99
- breaks << c if break_p
100
- break
101
- end
102
- end
103
- end
104
-
105
- breaks.sort!
106
-
107
- sentences = []
108
- lastbreak = 0
109
- breaks.each do |b|
110
- sentences << [lastbreak, b[0]] if b[0] > lastbreak
111
- lastbreak = b[1]
112
- end
113
- sentences << [lastbreak, text.length] if lastbreak < text.length
114
-
115
- sentences
116
- end
6
+ ## default rules
7
+
8
+ DEFAULT_RULES = {
9
+ # All the positions of new line characters always take sentence break.
10
+ break_pattern: /([ \t]*\n+)+[ \t]*/,
11
+
12
+ # All the positions of space and tab characters are candidates of sentence break.
13
+ candidate_pattern: /[ \t\n]+/,
14
+
15
+ # First, positive rules are applied to the break candidates to make initial segmentations.
16
+ positive_rules: [
17
+ [/[.!?]\z/, /\A[0-9A-Z]/],
18
+ [/[:]\z/, /\A[0-9]/],
19
+ [/[:]\z/, /\A[A-Z][a-z]/]
20
+ ],
21
+
22
+ # Then, negative rules are applied to cancel some initial segmentations.
23
+ negative_rules: [
24
+ # Titles before names
25
+ [/(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.\z/, /\A[A-Z][a-z]/],
26
+
27
+ # Titles usually before names, but ..
28
+ [/(Sr|Jr)\.\z/, /\A[A-Z][a-z]/],
29
+
30
+ # Single letter abbreviations, e.g. middle name
31
+ # [/\b[A-Z]\.\z/, /\A[A-Z][a-z]/],
32
+
33
+ # Abbreviations, e.g. middle name
34
+ [/\b[A-Z][a-z]*\.\z/, /\A[0-9A-Z]/],
35
+
36
+ # Frequent abbreviations that will never appear in the end of a sentence
37
+ [/(cf|vs)\.\z/, //],
38
+ [/e\.g\.\z/, //],
39
+ [/i\.e\.\z/, //],
40
+
41
+ # Others
42
+ [/(Sec|Chap|Fig|Eq)\.\z/, /\A[0-9A-Z]/]
43
+ ]
44
+ }
45
+
46
+ def initialize(rules = nil)
47
+ rules ||= DEFAULT_RULES
48
+ @rules = Hash[rules.map{|(k,v)| [k.to_sym,v]}]
49
+ @rules[:break_pattern] ||= nil
50
+ @rules[:candidate_pattern] ||= nil
51
+ @rules[:positive_rules] ||= []
52
+ @rules[:negative_rules] ||= []
53
+ end
54
+
55
+ def annotate(text)
56
+ sentences = segment(text)
57
+ blocks = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
58
+ {text:text, blocks:blocks}
59
+ end
60
+
61
+ def segment(text)
62
+ # hard sentence breaks
63
+ breaks = if @rules[:break_pattern].nil?
64
+ []
65
+ else
66
+ text.scan_offset(@rules[:break_pattern]).map{|m| m.offset(0)}
67
+ end
68
+
69
+ # candidates of sentence breaks
70
+ candidates = if @rules[:candidate_pattern].nil?
71
+ []
72
+ else
73
+ text.scan_offset(@rules[:candidate_pattern]).map{|m| m.offset(0)}
74
+ end
75
+
76
+ # hard sentence breaks are already in
77
+ candidates -= breaks
78
+
79
+ # filter using the positive rules. The first and last breaks remain.
80
+ candidates.select! do |c|
81
+ c[0] == 0 ||
82
+ c[1] == text.length ||
83
+ @rules[:positive_rules].any? do |r|
84
+ (text[0...c[0]] =~ r[0]) && (text[c[1]...-1] =~ r[1])
85
+ end
86
+ end
87
+
88
+ # filter using the negative rules
89
+ candidates.reject! do |c|
90
+ @rules[:negative_rules].any? do |r|
91
+ (text[0...c[0]] =~ r[0]) && (text[c[1]...-1] =~ r[1])
92
+ end
93
+ end
94
+
95
+ # add all the filtered candidates to sentence breaks
96
+ breaks += candidates
97
+ breaks.uniq!
98
+ breaks.sort!
99
+
100
+ # add the initial and final breaks unless already exist
101
+ breaks.unshift([0, 0]) if breaks.empty? || breaks[0][0] != 0
102
+ breaks.push([text.length, text.length]) unless breaks[-1][1] == text.length
103
+
104
+ # convert the breaks into sentences
105
+ sentences = (1 ... breaks.length).map{|i| [breaks[i-1][1], breaks[i][0]]}
106
+ end
117
107
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_sentencer
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 3.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-09-07 00:00:00.000000000 Z
11
+ date: 2024-08-03 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: TextSentencer is a simple rule-based system for segmenting text into
14
14
  sentences.
@@ -41,8 +41,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
41
41
  - !ruby/object:Gem::Version
42
42
  version: '0'
43
43
  requirements: []
44
- rubyforge_project:
45
- rubygems_version: 2.4.8
44
+ rubygems_version: 3.5.11
46
45
  signing_key:
47
46
  specification_version: 4
48
47
  summary: A simple, rule-based script to find sentence boundaries in text.