text_sentencer 1.0.3 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b8dda6323a11356aa2f6c78fa0281658ea677f09fbf0bfb19b19330f732e72c1
4
- data.tar.gz: a41e5cc5ae3e1e294e9ba4921993e2d98003d74435fd5bc099c6c6fbf188554a
3
+ metadata.gz: 13537fd663911db81cc5d4068c4c3d8439d332c4742e9d027c6687031f44f32f
4
+ data.tar.gz: dbd6aacd4e7dfca434c27608fa747bb7fbd8d43bf62af682f0e55784fcccf748
5
5
  SHA512:
6
- metadata.gz: b39ec569c1e988f4e7936385395ff1e34c15cb7721ffba6fb2453f633db360f092b5225d6ec4603b2beb391e2660291e9d48e912359eb33db9d2ccd1906d806f
7
- data.tar.gz: 2780a4c3f6ac1fc7f0ab06e658a293b7e9a7d3f5a2df0a0904fa20aea187857c9f3bad7cc7adfb6f1eba4c57897f433c03d77a958559e09225b60c154c9488b3
6
+ metadata.gz: 9758707a8204ebe212e699f2f3dd2011d3ad0fdf355cde4c85df5521d17aa6763df593915b88890aad9c29c10903bbada40be9ec66cc082305fdf5b614c252bb
7
+ data.tar.gz: '09938a12984f7d33c6337adcaf59af9087ce2969d5d98b37b3e73d833d015a49950b6d7b65bc845dc0820727ea4d2b5af61a8bb0af7ce5201b7531748f8772a8'
data/bin/text_sentencer CHANGED
@@ -38,8 +38,10 @@ annotations = sentencer.annotate(text)
38
38
  if output_mode == :json
39
39
  puts JSON.pretty_generate(annotations)
40
40
  else
41
- annotations[:denotations].each do |d|
42
- span = d[:span]
43
- puts text[span[:begin]...span[:end]]
41
+ if annotations.has_key?(:blocks)
42
+ annotations[:blocks].each do |d|
43
+ span = d[:span]
44
+ puts '[' + text[span[:begin]...span[:end]] + ']'
45
+ end
44
46
  end
45
47
  end
@@ -7,111 +7,101 @@ class TextSentencer
7
7
 
8
8
  DEFAULT_RULES = {
9
9
  # Every position of new line characters always makes a sentence break.
10
- break_pattern: "([ \t]*\n+)+[ \t]*",
10
+ break_pattern: /([ \t]*\n+)+[ \t]*/,
11
11
 
12
12
  # All the positions of space and tab characters are candidates of sentence break.
13
- candidate_pattern: "[ \t]+",
13
+ candidate_pattern: /[ \t\n]+/,
14
14
 
15
15
  # First, positive rules are applied to the break candidates to make initial segmentations.
16
16
  positive_rules: [
17
- ['[.!?]', '[0-9A-Z]'],
18
- ['[:]', '[0-9]'],
19
- ['[:]', '[A-Z][a-z]']
17
+ [/[.!?]\z/, /\A[0-9A-Z]/],
18
+ [/[:]\z/, /\A[0-9]/],
19
+ [/[:]\z/, /\A[A-Z][a-z]/]
20
20
  ],
21
21
 
22
22
  # Then, negative rules are applied to cancel some initial segmentations.
23
23
  negative_rules: [
24
24
  # Titles before names
25
- ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
25
+ [/(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.\z/, /\A[A-Z][a-z]/],
26
26
 
27
27
  # Titles usually before names, but ..
28
- ['(Sr|Jr)\.', '[A-Z][a-z]'],
28
+ [/(Sr|Jr)\.\z/, /\A[A-Z][a-z]/],
29
29
 
30
30
  # Single letter abbreviations, e.g. middle name
31
- # ['\b[A-Z]\.', '[A-Z][a-z]'],
31
+ # [/\b[A-Z]\.\z/, /\A[A-Z][a-z]/],
32
32
 
33
33
  # Abbreviations, e.g. middle name
34
- ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
34
+ [/\b[A-Z][a-z]*\.\z/, /\A[0-9A-Z]/],
35
35
 
36
36
  # Frequent abbreviations that will never appear at the end of a sentence
37
- ['(cf|vs)\.', ''],
38
- ['e\.g\.', ''],
39
- ['i\.e\.', ''],
37
+ [/(cf|vs)\.\z/, //],
38
+ [/e\.g\.\z/, //],
39
+ [/i\.e\.\z/, //],
40
40
 
41
41
  # Others
42
- ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
42
+ [/(Sec|Chap|Fig|Eq)\.\z/, /\A[0-9A-Z]/]
43
43
  ]
44
44
  }
45
45
 
46
46
  def initialize(rules = nil)
47
47
  rules ||= DEFAULT_RULES
48
48
  @rules = Hash[rules.map{|(k,v)| [k.to_sym,v]}]
49
- @rules[:break_pattern] ||= ""
50
- @rules[:candidate_pattern] ||= ""
49
+ @rules[:break_pattern] ||= nil
50
+ @rules[:candidate_pattern] ||= nil
51
51
  @rules[:positive_rules] ||= []
52
52
  @rules[:negative_rules] ||= []
53
53
  end
54
54
 
55
55
  def annotate(text)
56
- return nil if text.nil?
57
-
58
56
  sentences = segment(text)
59
- denotations = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
60
- {text:text, denotations:denotations}
57
+ blocks = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
58
+ {text:text, blocks:blocks}
61
59
  end
62
60
 
63
61
  def segment(text)
64
- breaks = if @rules[:break_pattern].empty?
62
+ # hard sentence breaks
63
+ breaks = if @rules[:break_pattern].nil?
65
64
  []
66
65
  else
67
- text.scan_offset(/#{@rules[:break_pattern]}/).map{|m| m.offset(0)}
66
+ text.scan_offset(@rules[:break_pattern]).map{|m| m.offset(0)}
68
67
  end
69
68
 
70
- candidates = if @rules[:candidate_pattern].empty?
69
+ # candidates of sentence breaks
70
+ candidates = if @rules[:candidate_pattern].nil?
71
71
  []
72
72
  else
73
- text.scan_offset(/#{@rules[:candidate_pattern]}/).map{|m| m.offset(0)}
73
+ text.scan_offset(@rules[:candidate_pattern]).map{|m| m.offset(0)}
74
74
  end
75
75
 
76
- # breaks take precedent
76
+ # hard sentence breaks are already in
77
77
  candidates -= breaks
78
78
 
79
- candidates.each do |c|
80
- last_end, next_begin = c
81
-
82
- if (last_end == 0) || (next_begin == text.length)
83
- breaks << c
84
- next
79
+ # filter using the positive rules. The first and last breaks remain.
80
+ candidates.select! do |c|
81
+ c[0] == 0 ||
82
+ c[1] == text.length ||
83
+ @rules[:positive_rules].any? do |r|
84
+ (text[0...c[0]] =~ r[0]) && (text[c[1]...-1] =~ r[1])
85
85
  end
86
+ end
86
87
 
87
- last_text = text[0...last_end]
88
- next_text = text[next_begin..-1]
89
-
90
- @rules[:positive_rules].each do |p|
91
- if (last_text =~ /#{p[0]}\Z/) && (next_text =~ /\A#{p[1]}/)
92
- break_p = true
93
- @rules[:negative_rules].each do |n|
94
- if (last_text =~ /#{n[0]}\Z/) && (next_text =~ /\A#{n[1]}/)
95
- break_p = false
96
- break
97
- end
98
- end
99
- breaks << c if break_p
100
- break
101
- end
88
+ # filter using the negative rules
89
+ candidates.reject! do |c|
90
+ @rules[:negative_rules].any? do |r|
91
+ (text[0...c[0]] =~ r[0]) && (text[c[1]...-1] =~ r[1])
102
92
  end
103
93
  end
104
94
 
95
+ # add all the filtered candidates to sentence breaks
96
+ breaks += candidates
97
+ breaks.uniq!
105
98
  breaks.sort!
106
99
 
107
- sentences = []
108
- lastbreak = 0
109
- breaks.each do |b|
110
- sentences << [lastbreak, b[0]] if b[0] > lastbreak
111
- lastbreak = b[1]
112
- end
113
- sentences << [lastbreak, text.length] if lastbreak < text.length
100
+ # add the initial and final breaks unless already exist
101
+ breaks.unshift([0, 0]) if breaks.empty? || breaks[0][0] != 0
102
+ breaks.push([text.length, text.length]) unless breaks[-1][1] == text.length
114
103
 
115
- sentences
104
+ # convert the breaks into sentences
105
+ sentences = (1 ... breaks.length).map{|i| [breaks[i-1][1], breaks[i][0]]}
116
106
  end
117
107
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_sentencer
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.3
4
+ version: 3.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-01-17 00:00:00.000000000 Z
11
+ date: 2024-08-03 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: TextSentencer is a simple rule-based system for segmenting text into
14
14
  sentences.
@@ -41,7 +41,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
41
41
  - !ruby/object:Gem::Version
42
42
  version: '0'
43
43
  requirements: []
44
- rubygems_version: 3.2.3
44
+ rubygems_version: 3.5.11
45
45
  signing_key:
46
46
  specification_version: 4
47
47
  summary: A simple, rule-based script to find sentence boundaries in text.