text_sentencer 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 7008e00d8e1554e27e82bbf744eb74bf10704234
4
- data.tar.gz: e863a486fe7f932c6354abd314c6e9690f4b1ece
2
+ SHA256:
3
+ metadata.gz: b8dda6323a11356aa2f6c78fa0281658ea677f09fbf0bfb19b19330f732e72c1
4
+ data.tar.gz: a41e5cc5ae3e1e294e9ba4921993e2d98003d74435fd5bc099c6c6fbf188554a
5
5
  SHA512:
6
- metadata.gz: a2762e82db9eece26fd97ec7ac3c44f13ad3fe9babe2910238d1ed5e4a616d3afbaac3ff421d6dda354d3ceb708ca0a5ce7877563f253bf726599f898f0a5f44
7
- data.tar.gz: 9ee5c91045041905f61218944388157fdc7ffb53b6d7f8bd784ef8a286aa2acdf6cca524fb31f4a1e0725466bdbe130de55ef4b00535deb7bf5ae1b65c6ed49b
6
+ metadata.gz: b39ec569c1e988f4e7936385395ff1e34c15cb7721ffba6fb2453f633db360f092b5225d6ec4603b2beb391e2660291e9d48e912359eb33db9d2ccd1906d806f
7
+ data.tar.gz: 2780a4c3f6ac1fc7f0ab06e658a293b7e9a7d3f5a2df0a0904fa20aea187857c9f3bad7cc7adfb6f1eba4c57897f433c03d77a958559e09225b60c154c9488b3
@@ -8,20 +8,20 @@ output_mode = :sentences
8
8
  ## command line option processing
9
9
  require 'optparse'
10
10
  optparse = OptionParser.new do |opts|
11
- opts.banner = "Usage: text_sentencer [options]"
12
-
13
- opts.on('-c', '--config=config_filename', 'specifies the configuration JSON file.') do |c|
14
- config_filename = c
15
- end
16
-
17
- opts.on('-j', '--json_output', 'outputs the result in JSON.') do
18
- output_mode = :json
19
- end
20
-
21
- opts.on('-h', '--help', 'displays this screen.') do
22
- puts opts
23
- exit
24
- end
11
+ opts.banner = "Usage: text_sentencer [options]"
12
+
13
+ opts.on('-c', '--config=config_filename', 'specifies the configuration JSON file.') do |c|
14
+ config_filename = c
15
+ end
16
+
17
+ opts.on('-j', '--json_output', 'outputs the result in JSON.') do
18
+ output_mode = :json
19
+ end
20
+
21
+ opts.on('-h', '--help', 'displays this screen.') do
22
+ puts opts
23
+ exit
24
+ end
25
25
  end
26
26
 
27
27
  optparse.parse!
@@ -36,10 +36,10 @@ text = ARGF.read
36
36
  annotations = sentencer.annotate(text)
37
37
 
38
38
  if output_mode == :json
39
- puts JSON.pretty_generate(annotations)
39
+ puts JSON.pretty_generate(annotations)
40
40
  else
41
- annotations[:denotations].each do |d|
42
- span = d[:span]
43
- puts text[span[:begin]...span[:end]]
44
- end
41
+ annotations[:denotations].each do |d|
42
+ span = d[:span]
43
+ puts text[span[:begin]...span[:end]]
44
+ end
45
45
  end
@@ -1,9 +1,9 @@
1
1
  class String
2
- def scan_offset(regex)
3
- Enumerator.new do |y|
4
- self.scan(regex) do
5
- y << Regexp.last_match
6
- end
7
- end
8
- end
2
+ def scan_offset(regex)
3
+ Enumerator.new do |y|
4
+ self.scan(regex) do
5
+ y << Regexp.last_match
6
+ end
7
+ end
8
+ end
9
9
  end
@@ -3,115 +3,115 @@ require 'text_sentencer/string_scan_offset'
3
3
  require 'pp'
4
4
 
5
5
  class TextSentencer
6
- ## default rules
7
-
8
- DEFAULT_RULES = {
9
- # All the positions of new line characters always take sentence break.
10
- break_pattern: "([ \t]*\n+)+[ \t]*",
11
-
12
- # All the positions of space and tab characters are candiates of sentence break.
13
- candidate_pattern: "[ \t]+",
14
-
15
- # First, positive rules are applied to the break candidates to make initial segmantations.
16
- positive_rules: [
17
- ['[.!?]', '[0-9A-Z]'],
18
- ['[:]', '[0-9]'],
19
- ['[:]', '[A-Z][a-z]']
20
- ],
21
-
22
- # Then, negative rules are applied to cancel some initial segmentations.
23
- negative_rules: [
24
- # Titles before names
25
- ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
26
-
27
- # Titles usually before names, but ..
28
- ['(Sr|Jr)\.', '[A-Z][a-z]'],
29
-
30
- # Single letter abbriveations, e.g. middle name
31
- # ['\b[A-Z]\.', '[A-Z][a-z]'],
32
-
33
- # Abbriveations, e.g. middle name
34
- ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
35
-
36
- # Frequent abbreviations that will never appear in the end of a sentence
37
- ['(cf|vs)\.', ''],
38
- ['e\.g\.', ''],
39
- ['i\.e\.', ''],
40
-
41
- # Others
42
- ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
43
- ]
44
- }
45
-
46
- def initialize(rules = nil)
47
- rules ||= DEFAULT_RULES
48
- @rules = Hash[rules.map{|(k,v)| [k.to_sym,v]}]
49
- @rules[:break_pattern] ||= ""
50
- @rules[:candidate_pattern] ||= ""
51
- @rules[:positive_rules] ||= []
52
- @rules[:negative_rules] ||= []
53
- end
54
-
55
- def annotate(text)
56
- return nil if text.nil?
57
-
58
- sentences = segment(text)
59
- denotations = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
60
- {text:text, denotations:denotations}
61
- end
62
-
63
- def segment(text)
64
- breaks = if @rules[:break_pattern].empty?
65
- []
66
- else
67
- text.scan_offset(/#{@rules[:break_pattern]}/).map{|m| m.offset(0)}
68
- end
69
-
70
- candidates = if @rules[:candidate_pattern].empty?
71
- []
72
- else
73
- text.scan_offset(/#{@rules[:candidate_pattern]}/).map{|m| m.offset(0)}
74
- end
75
-
76
- # breaks take precedent
77
- candidates -= breaks
78
-
79
- candidates.each do |c|
80
- last_end, next_begin = c
81
-
82
- if (last_end == 0) || (next_begin == text.length)
83
- breaks << c
84
- next
85
- end
86
-
87
- last_text = text[0...last_end]
88
- next_text = text[next_begin..-1]
89
-
90
- @rules[:positive_rules].each do |p|
91
- if (last_text =~ /#{p[0]}\Z/) && (next_text =~ /\A#{p[1]}/)
92
- break_p = true
93
- @rules[:negative_rules].each do |n|
94
- if (last_text =~ /#{n[0]}\Z/) && (next_text =~ /\A#{n[1]}/)
95
- break_p = false
96
- break
97
- end
98
- end
99
- breaks << c if break_p
100
- break
101
- end
102
- end
103
- end
104
-
105
- breaks.sort!
106
-
107
- sentences = []
108
- lastbreak = 0
109
- breaks.each do |b|
110
- sentences << [lastbreak, b[0]] if b[0] > lastbreak
111
- lastbreak = b[1]
112
- end
113
- sentences << [lastbreak, text.length] if lastbreak < text.length
114
-
115
- sentences
116
- end
6
+ ## default rules
7
+
8
+ DEFAULT_RULES = {
9
+ # Newline characters always mark a sentence break.
10
+ break_pattern: "([ \t]*\n+)+[ \t]*",
11
+
12
+ # All the positions of space and tab characters are candidates for a sentence break.
13
+ candidate_pattern: "[ \t]+",
14
+
15
+ # First, positive rules are applied to the break candidates to make initial segmentations.
16
+ positive_rules: [
17
+ ['[.!?]', '[0-9A-Z]'],
18
+ ['[:]', '[0-9]'],
19
+ ['[:]', '[A-Z][a-z]']
20
+ ],
21
+
22
+ # Then, negative rules are applied to cancel some initial segmentations.
23
+ negative_rules: [
24
+ # Titles before names
25
+ ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
26
+
27
+ # Titles usually before names, but ..
28
+ ['(Sr|Jr)\.', '[A-Z][a-z]'],
29
+
30
+ # Single letter abbreviations, e.g. middle name
31
+ # ['\b[A-Z]\.', '[A-Z][a-z]'],
32
+
33
+ # Abbreviations, e.g. middle name
34
+ ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
35
+
36
+ # Frequent abbreviations that will never appear at the end of a sentence
37
+ ['(cf|vs)\.', ''],
38
+ ['e\.g\.', ''],
39
+ ['i\.e\.', ''],
40
+
41
+ # Others
42
+ ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
43
+ ]
44
+ }
45
+
46
+ def initialize(rules = nil)
47
+ rules ||= DEFAULT_RULES
48
+ @rules = Hash[rules.map{|(k,v)| [k.to_sym,v]}]
49
+ @rules[:break_pattern] ||= ""
50
+ @rules[:candidate_pattern] ||= ""
51
+ @rules[:positive_rules] ||= []
52
+ @rules[:negative_rules] ||= []
53
+ end
54
+
55
+ def annotate(text)
56
+ return nil if text.nil?
57
+
58
+ sentences = segment(text)
59
+ denotations = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
60
+ {text:text, denotations:denotations}
61
+ end
62
+
63
+ def segment(text)
64
+ breaks = if @rules[:break_pattern].empty?
65
+ []
66
+ else
67
+ text.scan_offset(/#{@rules[:break_pattern]}/).map{|m| m.offset(0)}
68
+ end
69
+
70
+ candidates = if @rules[:candidate_pattern].empty?
71
+ []
72
+ else
73
+ text.scan_offset(/#{@rules[:candidate_pattern]}/).map{|m| m.offset(0)}
74
+ end
75
+
76
+ # breaks take precedence
77
+ candidates -= breaks
78
+
79
+ candidates.each do |c|
80
+ last_end, next_begin = c
81
+
82
+ if (last_end == 0) || (next_begin == text.length)
83
+ breaks << c
84
+ next
85
+ end
86
+
87
+ last_text = text[0...last_end]
88
+ next_text = text[next_begin..-1]
89
+
90
+ @rules[:positive_rules].each do |p|
91
+ if (last_text =~ /#{p[0]}\Z/) && (next_text =~ /\A#{p[1]}/)
92
+ break_p = true
93
+ @rules[:negative_rules].each do |n|
94
+ if (last_text =~ /#{n[0]}\Z/) && (next_text =~ /\A#{n[1]}/)
95
+ break_p = false
96
+ break
97
+ end
98
+ end
99
+ breaks << c if break_p
100
+ break
101
+ end
102
+ end
103
+ end
104
+
105
+ breaks.sort!
106
+
107
+ sentences = []
108
+ lastbreak = 0
109
+ breaks.each do |b|
110
+ sentences << [lastbreak, b[0]] if b[0] > lastbreak
111
+ lastbreak = b[1]
112
+ end
113
+ sentences << [lastbreak, text.length] if lastbreak < text.length
114
+
115
+ sentences
116
+ end
117
117
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_sentencer
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-09-07 00:00:00.000000000 Z
11
+ date: 2021-01-17 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: TextSentencer is a simple rule-based system for segmenting text into
14
14
  sentences.
@@ -41,8 +41,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
41
41
  - !ruby/object:Gem::Version
42
42
  version: '0'
43
43
  requirements: []
44
- rubyforge_project:
45
- rubygems_version: 2.4.8
44
+ rubygems_version: 3.2.3
46
45
  signing_key:
47
46
  specification_version: 4
48
47
  summary: A simple, rule-based script to find sentence boundaries in text.