text_sentencer 1.0.3 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b8dda6323a11356aa2f6c78fa0281658ea677f09fbf0bfb19b19330f732e72c1
4
- data.tar.gz: a41e5cc5ae3e1e294e9ba4921993e2d98003d74435fd5bc099c6c6fbf188554a
3
+ metadata.gz: 13537fd663911db81cc5d4068c4c3d8439d332c4742e9d027c6687031f44f32f
4
+ data.tar.gz: dbd6aacd4e7dfca434c27608fa747bb7fbd8d43bf62af682f0e55784fcccf748
5
5
  SHA512:
6
- metadata.gz: b39ec569c1e988f4e7936385395ff1e34c15cb7721ffba6fb2453f633db360f092b5225d6ec4603b2beb391e2660291e9d48e912359eb33db9d2ccd1906d806f
7
- data.tar.gz: 2780a4c3f6ac1fc7f0ab06e658a293b7e9a7d3f5a2df0a0904fa20aea187857c9f3bad7cc7adfb6f1eba4c57897f433c03d77a958559e09225b60c154c9488b3
6
+ metadata.gz: 9758707a8204ebe212e699f2f3dd2011d3ad0fdf355cde4c85df5521d17aa6763df593915b88890aad9c29c10903bbada40be9ec66cc082305fdf5b614c252bb
7
+ data.tar.gz: '09938a12984f7d33c6337adcaf59af9087ce2969d5d98b37b3e73d833d015a49950b6d7b65bc845dc0820727ea4d2b5af61a8bb0af7ce5201b7531748f8772a8'
data/bin/text_sentencer CHANGED
@@ -38,8 +38,10 @@ annotations = sentencer.annotate(text)
38
38
  if output_mode == :json
39
39
  puts JSON.pretty_generate(annotations)
40
40
  else
41
- annotations[:denotations].each do |d|
42
- span = d[:span]
43
- puts text[span[:begin]...span[:end]]
41
+ if annotations.has_key?(:blocks)
42
+ annotations[:blocks].each do |d|
43
+ span = d[:span]
44
+ puts '[' + text[span[:begin]...span[:end]] + ']'
45
+ end
44
46
  end
45
47
  end
@@ -7,111 +7,101 @@ class TextSentencer
7
7
 
8
8
  DEFAULT_RULES = {
9
9
  # All the positions of new line characters always take sentence break.
10
- break_pattern: "([ \t]*\n+)+[ \t]*",
10
+ break_pattern: /([ \t]*\n+)+[ \t]*/,
11
11
 
12
12
  # All the positions of space and tab characters are candidates of sentence break.
13
- candidate_pattern: "[ \t]+",
13
+ candidate_pattern: /[ \t\n]+/,
14
14
 
15
15
  # First, positive rules are applied to the break candidates to make initial segmentations.
16
16
  positive_rules: [
17
- ['[.!?]', '[0-9A-Z]'],
18
- ['[:]', '[0-9]'],
19
- ['[:]', '[A-Z][a-z]']
17
+ [/[.!?]\z/, /\A[0-9A-Z]/],
18
+ [/[:]\z/, /\A[0-9]/],
19
+ [/[:]\z/, /\A[A-Z][a-z]/]
20
20
  ],
21
21
 
22
22
  # Then, negative rules are applied to cancel some initial segmentations.
23
23
  negative_rules: [
24
24
  # Titles before names
25
- ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
25
+ [/(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.\z/, /\A[A-Z][a-z]/],
26
26
 
27
27
  # Titles usually before names, but ..
28
- ['(Sr|Jr)\.', '[A-Z][a-z]'],
28
+ [/(Sr|Jr)\.\z/, /\A[A-Z][a-z]/],
29
29
 
30
30
  # Single letter abbreviations, e.g. middle name
31
- # ['\b[A-Z]\.', '[A-Z][a-z]'],
31
+ # [/\b[A-Z]\.\z/, /\A[A-Z][a-z]/],
32
32
 
33
33
  # Abbreviations, e.g. middle name
34
- ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
34
+ [/\b[A-Z][a-z]*\.\z/, /\A[0-9A-Z]/],
35
35
 
36
36
  # Frequent abbreviations that will never appear at the end of a sentence
37
- ['(cf|vs)\.', ''],
38
- ['e\.g\.', ''],
39
- ['i\.e\.', ''],
37
+ [/(cf|vs)\.\z/, //],
38
+ [/e\.g\.\z/, //],
39
+ [/i\.e\.\z/, //],
40
40
 
41
41
  # Others
42
- ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
42
+ [/(Sec|Chap|Fig|Eq)\.\z/, /\A[0-9A-Z]/]
43
43
  ]
44
44
  }
45
45
 
46
46
  def initialize(rules = nil)
47
47
  rules ||= DEFAULT_RULES
48
48
  @rules = Hash[rules.map{|(k,v)| [k.to_sym,v]}]
49
- @rules[:break_pattern] ||= ""
50
- @rules[:candidate_pattern] ||= ""
49
+ @rules[:break_pattern] ||= nil
50
+ @rules[:candidate_pattern] ||= nil
51
51
  @rules[:positive_rules] ||= []
52
52
  @rules[:negative_rules] ||= []
53
53
  end
54
54
 
55
55
  def annotate(text)
56
- return nil if text.nil?
57
-
58
56
  sentences = segment(text)
59
- denotations = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
60
- {text:text, denotations:denotations}
57
+ blocks = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
58
+ {text:text, blocks:blocks}
61
59
  end
62
60
 
63
61
  def segment(text)
64
- breaks = if @rules[:break_pattern].empty?
62
+ # hard sentence breaks
63
+ breaks = if @rules[:break_pattern].nil?
65
64
  []
66
65
  else
67
- text.scan_offset(/#{@rules[:break_pattern]}/).map{|m| m.offset(0)}
66
+ text.scan_offset(@rules[:break_pattern]).map{|m| m.offset(0)}
68
67
  end
69
68
 
70
- candidates = if @rules[:candidate_pattern].empty?
69
+ # candidates of sentence breaks
70
+ candidates = if @rules[:candidate_pattern].nil?
71
71
  []
72
72
  else
73
- text.scan_offset(/#{@rules[:candidate_pattern]}/).map{|m| m.offset(0)}
73
+ text.scan_offset(@rules[:candidate_pattern]).map{|m| m.offset(0)}
74
74
  end
75
75
 
76
- # breaks take precedent
76
+ # hard sentence breaks are already in
77
77
  candidates -= breaks
78
78
 
79
- candidates.each do |c|
80
- last_end, next_begin = c
81
-
82
- if (last_end == 0) || (next_begin == text.length)
83
- breaks << c
84
- next
79
+ # filter using the positive rules. The first and last breaks remain.
80
+ candidates.select! do |c|
81
+ c[0] == 0 ||
82
+ c[1] == text.length ||
83
+ @rules[:positive_rules].any? do |r|
84
+ (text[0...c[0]] =~ r[0]) && (text[c[1]...-1] =~ r[1])
85
85
  end
86
+ end
86
87
 
87
- last_text = text[0...last_end]
88
- next_text = text[next_begin..-1]
89
-
90
- @rules[:positive_rules].each do |p|
91
- if (last_text =~ /#{p[0]}\Z/) && (next_text =~ /\A#{p[1]}/)
92
- break_p = true
93
- @rules[:negative_rules].each do |n|
94
- if (last_text =~ /#{n[0]}\Z/) && (next_text =~ /\A#{n[1]}/)
95
- break_p = false
96
- break
97
- end
98
- end
99
- breaks << c if break_p
100
- break
101
- end
88
+ # filter using the negative rules
89
+ candidates.reject! do |c|
90
+ @rules[:negative_rules].any? do |r|
91
+ (text[0...c[0]] =~ r[0]) && (text[c[1]...-1] =~ r[1])
102
92
  end
103
93
  end
104
94
 
95
+ # add all the filtered candidates to sentence breaks
96
+ breaks += candidates
97
+ breaks.uniq!
105
98
  breaks.sort!
106
99
 
107
- sentences = []
108
- lastbreak = 0
109
- breaks.each do |b|
110
- sentences << [lastbreak, b[0]] if b[0] > lastbreak
111
- lastbreak = b[1]
112
- end
113
- sentences << [lastbreak, text.length] if lastbreak < text.length
100
+ # add the initial and final breaks unless they already exist
101
+ breaks.unshift([0, 0]) if breaks.empty? || breaks[0][0] != 0
102
+ breaks.push([text.length, text.length]) unless breaks[-1][1] == text.length
114
103
 
115
- sentences
104
+ # convert the breaks into sentences
105
+ sentences = (1 ... breaks.length).map{|i| [breaks[i-1][1], breaks[i][0]]}
116
106
  end
117
107
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_sentencer
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.3
4
+ version: 3.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-01-17 00:00:00.000000000 Z
11
+ date: 2024-08-03 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: TextSentencer is a simple rule-based system for segmenting text into
14
14
  sentences.
@@ -41,7 +41,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
41
41
  - !ruby/object:Gem::Version
42
42
  version: '0'
43
43
  requirements: []
44
- rubygems_version: 3.2.3
44
+ rubygems_version: 3.5.11
45
45
  signing_key:
46
46
  specification_version: 4
47
47
  summary: A simple, rule-based script to find sentence boundaries in text.