text_sentencer 1.0.2 → 1.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 7008e00d8e1554e27e82bbf744eb74bf10704234
4
- data.tar.gz: e863a486fe7f932c6354abd314c6e9690f4b1ece
2
+ SHA256:
3
+ metadata.gz: b8dda6323a11356aa2f6c78fa0281658ea677f09fbf0bfb19b19330f732e72c1
4
+ data.tar.gz: a41e5cc5ae3e1e294e9ba4921993e2d98003d74435fd5bc099c6c6fbf188554a
5
5
  SHA512:
6
- metadata.gz: a2762e82db9eece26fd97ec7ac3c44f13ad3fe9babe2910238d1ed5e4a616d3afbaac3ff421d6dda354d3ceb708ca0a5ce7877563f253bf726599f898f0a5f44
7
- data.tar.gz: 9ee5c91045041905f61218944388157fdc7ffb53b6d7f8bd784ef8a286aa2acdf6cca524fb31f4a1e0725466bdbe130de55ef4b00535deb7bf5ae1b65c6ed49b
6
+ metadata.gz: b39ec569c1e988f4e7936385395ff1e34c15cb7721ffba6fb2453f633db360f092b5225d6ec4603b2beb391e2660291e9d48e912359eb33db9d2ccd1906d806f
7
+ data.tar.gz: 2780a4c3f6ac1fc7f0ab06e658a293b7e9a7d3f5a2df0a0904fa20aea187857c9f3bad7cc7adfb6f1eba4c57897f433c03d77a958559e09225b60c154c9488b3
@@ -8,20 +8,20 @@ output_mode = :sentences
8
8
  ## command line option processing
9
9
  require 'optparse'
10
10
  optparse = OptionParser.new do |opts|
11
- opts.banner = "Usage: text_sentencer [options]"
12
-
13
- opts.on('-c', '--config=config_filename', 'specifies the configuration JSON file.') do |c|
14
- config_filename = c
15
- end
16
-
17
- opts.on('-j', '--json_output', 'outputs the result in JSON.') do
18
- output_mode = :json
19
- end
20
-
21
- opts.on('-h', '--help', 'displays this screen.') do
22
- puts opts
23
- exit
24
- end
11
+ opts.banner = "Usage: text_sentencer [options]"
12
+
13
+ opts.on('-c', '--config=config_filename', 'specifies the configuration JSON file.') do |c|
14
+ config_filename = c
15
+ end
16
+
17
+ opts.on('-j', '--json_output', 'outputs the result in JSON.') do
18
+ output_mode = :json
19
+ end
20
+
21
+ opts.on('-h', '--help', 'displays this screen.') do
22
+ puts opts
23
+ exit
24
+ end
25
25
  end
26
26
 
27
27
  optparse.parse!
@@ -36,10 +36,10 @@ text = ARGF.read
36
36
  annotations = sentencer.annotate(text)
37
37
 
38
38
  if output_mode == :json
39
- puts JSON.pretty_generate(annotations)
39
+ puts JSON.pretty_generate(annotations)
40
40
  else
41
- annotations[:denotations].each do |d|
42
- span = d[:span]
43
- puts text[span[:begin]...span[:end]]
44
- end
41
+ annotations[:denotations].each do |d|
42
+ span = d[:span]
43
+ puts text[span[:begin]...span[:end]]
44
+ end
45
45
  end
@@ -1,9 +1,9 @@
1
1
  class String
2
- def scan_offset(regex)
3
- Enumerator.new do |y|
4
- self.scan(regex) do
5
- y << Regexp.last_match
6
- end
7
- end
8
- end
2
+ def scan_offset(regex)
3
+ Enumerator.new do |y|
4
+ self.scan(regex) do
5
+ y << Regexp.last_match
6
+ end
7
+ end
8
+ end
9
9
  end
@@ -3,115 +3,115 @@ require 'text_sentencer/string_scan_offset'
3
3
  require 'pp'
4
4
 
5
5
  class TextSentencer
6
- ## default rules
7
-
8
- DEFAULT_RULES = {
9
- # All the positions of new line characters always take sentence break.
10
- break_pattern: "([ \t]*\n+)+[ \t]*",
11
-
12
- # All the positions of space and tab characters are candiates of sentence break.
13
- candidate_pattern: "[ \t]+",
14
-
15
- # First, positive rules are applied to the break candidates to make initial segmantations.
16
- positive_rules: [
17
- ['[.!?]', '[0-9A-Z]'],
18
- ['[:]', '[0-9]'],
19
- ['[:]', '[A-Z][a-z]']
20
- ],
21
-
22
- # Then, negative rules are applied to cancel some initial segmentations.
23
- negative_rules: [
24
- # Titles before names
25
- ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
26
-
27
- # Titles usually before names, but ..
28
- ['(Sr|Jr)\.', '[A-Z][a-z]'],
29
-
30
- # Single letter abbriveations, e.g. middle name
31
- # ['\b[A-Z]\.', '[A-Z][a-z]'],
32
-
33
- # Abbriveations, e.g. middle name
34
- ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
35
-
36
- # Frequent abbreviations that will never appear in the end of a sentence
37
- ['(cf|vs)\.', ''],
38
- ['e\.g\.', ''],
39
- ['i\.e\.', ''],
40
-
41
- # Others
42
- ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
43
- ]
44
- }
45
-
46
- def initialize(rules = nil)
47
- rules ||= DEFAULT_RULES
48
- @rules = Hash[rules.map{|(k,v)| [k.to_sym,v]}]
49
- @rules[:break_pattern] ||= ""
50
- @rules[:candidate_pattern] ||= ""
51
- @rules[:positive_rules] ||= []
52
- @rules[:negative_rules] ||= []
53
- end
54
-
55
- def annotate(text)
56
- return nil if text.nil?
57
-
58
- sentences = segment(text)
59
- denotations = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
60
- {text:text, denotations:denotations}
61
- end
62
-
63
- def segment(text)
64
- breaks = if @rules[:break_pattern].empty?
65
- []
66
- else
67
- text.scan_offset(/#{@rules[:break_pattern]}/).map{|m| m.offset(0)}
68
- end
69
-
70
- candidates = if @rules[:candidate_pattern].empty?
71
- []
72
- else
73
- text.scan_offset(/#{@rules[:candidate_pattern]}/).map{|m| m.offset(0)}
74
- end
75
-
76
- # breaks take precedent
77
- candidates -= breaks
78
-
79
- candidates.each do |c|
80
- last_end, next_begin = c
81
-
82
- if (last_end == 0) || (next_begin == text.length)
83
- breaks << c
84
- next
85
- end
86
-
87
- last_text = text[0...last_end]
88
- next_text = text[next_begin..-1]
89
-
90
- @rules[:positive_rules].each do |p|
91
- if (last_text =~ /#{p[0]}\Z/) && (next_text =~ /\A#{p[1]}/)
92
- break_p = true
93
- @rules[:negative_rules].each do |n|
94
- if (last_text =~ /#{n[0]}\Z/) && (next_text =~ /\A#{n[1]}/)
95
- break_p = false
96
- break
97
- end
98
- end
99
- breaks << c if break_p
100
- break
101
- end
102
- end
103
- end
104
-
105
- breaks.sort!
106
-
107
- sentences = []
108
- lastbreak = 0
109
- breaks.each do |b|
110
- sentences << [lastbreak, b[0]] if b[0] > lastbreak
111
- lastbreak = b[1]
112
- end
113
- sentences << [lastbreak, text.length] if lastbreak < text.length
114
-
115
- sentences
116
- end
6
+ ## default rules
7
+
8
+ DEFAULT_RULES = {
9
+ # All the positions of newline characters are always treated as sentence breaks.
10
+ break_pattern: "([ \t]*\n+)+[ \t]*",
11
+
12
+ # All the positions of space and tab characters are candidates for sentence breaks.
13
+ candidate_pattern: "[ \t]+",
14
+
15
+ # First, positive rules are applied to the break candidates to make initial segmentations.
16
+ positive_rules: [
17
+ ['[.!?]', '[0-9A-Z]'],
18
+ ['[:]', '[0-9]'],
19
+ ['[:]', '[A-Z][a-z]']
20
+ ],
21
+
22
+ # Then, negative rules are applied to cancel some initial segmentations.
23
+ negative_rules: [
24
+ # Titles before names
25
+ ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
26
+
27
+ # Titles usually before names, but ..
28
+ ['(Sr|Jr)\.', '[A-Z][a-z]'],
29
+
30
+ # Single letter abbreviations, e.g. middle name
31
+ # ['\b[A-Z]\.', '[A-Z][a-z]'],
32
+
33
+ # Abbreviations, e.g. middle name
34
+ ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
35
+
36
+ # Frequent abbreviations that will never appear at the end of a sentence
37
+ ['(cf|vs)\.', ''],
38
+ ['e\.g\.', ''],
39
+ ['i\.e\.', ''],
40
+
41
+ # Others
42
+ ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
43
+ ]
44
+ }
45
+
46
+ def initialize(rules = nil)
47
+ rules ||= DEFAULT_RULES
48
+ @rules = Hash[rules.map{|(k,v)| [k.to_sym,v]}]
49
+ @rules[:break_pattern] ||= ""
50
+ @rules[:candidate_pattern] ||= ""
51
+ @rules[:positive_rules] ||= []
52
+ @rules[:negative_rules] ||= []
53
+ end
54
+
55
+ def annotate(text)
56
+ return nil if text.nil?
57
+
58
+ sentences = segment(text)
59
+ denotations = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
60
+ {text:text, denotations:denotations}
61
+ end
62
+
63
+ def segment(text)
64
+ breaks = if @rules[:break_pattern].empty?
65
+ []
66
+ else
67
+ text.scan_offset(/#{@rules[:break_pattern]}/).map{|m| m.offset(0)}
68
+ end
69
+
70
+ candidates = if @rules[:candidate_pattern].empty?
71
+ []
72
+ else
73
+ text.scan_offset(/#{@rules[:candidate_pattern]}/).map{|m| m.offset(0)}
74
+ end
75
+
76
+ # breaks take precedence
77
+ candidates -= breaks
78
+
79
+ candidates.each do |c|
80
+ last_end, next_begin = c
81
+
82
+ if (last_end == 0) || (next_begin == text.length)
83
+ breaks << c
84
+ next
85
+ end
86
+
87
+ last_text = text[0...last_end]
88
+ next_text = text[next_begin..-1]
89
+
90
+ @rules[:positive_rules].each do |p|
91
+ if (last_text =~ /#{p[0]}\Z/) && (next_text =~ /\A#{p[1]}/)
92
+ break_p = true
93
+ @rules[:negative_rules].each do |n|
94
+ if (last_text =~ /#{n[0]}\Z/) && (next_text =~ /\A#{n[1]}/)
95
+ break_p = false
96
+ break
97
+ end
98
+ end
99
+ breaks << c if break_p
100
+ break
101
+ end
102
+ end
103
+ end
104
+
105
+ breaks.sort!
106
+
107
+ sentences = []
108
+ lastbreak = 0
109
+ breaks.each do |b|
110
+ sentences << [lastbreak, b[0]] if b[0] > lastbreak
111
+ lastbreak = b[1]
112
+ end
113
+ sentences << [lastbreak, text.length] if lastbreak < text.length
114
+
115
+ sentences
116
+ end
117
117
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_sentencer
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-09-07 00:00:00.000000000 Z
11
+ date: 2021-01-17 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: TextSentencer is a simple rule-based system for segmenting text into
14
14
  sentences.
@@ -41,8 +41,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
41
41
  - !ruby/object:Gem::Version
42
42
  version: '0'
43
43
  requirements: []
44
- rubyforge_project:
45
- rubygems_version: 2.4.8
44
+ rubygems_version: 3.2.3
46
45
  signing_key:
47
46
  specification_version: 4
48
47
  summary: A simple, rule-based script to find sentence boundaries in text.