scalpel 0.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/scalpel.rb +6 -2
- metadata +2 -2
data/lib/scalpel.rb
CHANGED
@@ -8,7 +8,7 @@
|
|
8
8
|
class Scalpel
|
9
9
|
|
10
10
|
# Current version.
|
11
|
-
VERSION = '0.2'
|
11
|
+
VERSION = '0.2.1'
|
12
12
|
|
13
13
|
# Segment a text using the Scalpel algorithm.
|
14
14
|
# This will eventually be ported to a gem.
|
@@ -22,6 +22,8 @@ class Scalpel
|
|
22
22
|
text.gsub!('...', '&;&.')
|
23
23
|
# Remove floating point numbers.
|
24
24
|
text.gsub!(/([0-9]+)\.([0-9]+)/) { $1 + '&@&' + $2 }
|
25
|
+
# Handle floats without leading zero.
|
26
|
+
text.gsub!(/\s\.([0-9]+)/) { ' &#&' + $1 }
|
25
27
|
# Remove abbreviations.
|
26
28
|
text.gsub!(/(?:[A-Za-z]\.){2,}/) { |abbr| abbr.gsub('.', '&-&') }
|
27
29
|
# Remove titles.
|
@@ -63,7 +65,9 @@ class Scalpel
|
|
63
65
|
sentence.gsub!(/&%&([.!?])/) { $1 + "'" }
|
64
66
|
sentence.gsub!(/&\^&([.?!])/) { "'" + $1 + '"' }
|
65
67
|
sentence.gsub!(/&\*&([.?!])/) { "'" + $1 + '”' }
|
66
|
-
sentence.gsub!(
|
68
|
+
sentence.gsub!(/&\$&([.!?])/) { $1 + '"' }
|
69
|
+
# Repair floats without leading zeros.
|
70
|
+
sentence.gsub!(/&#&([0-9]+)/) { '.' + $1 }
|
67
71
|
results << sentence.strip
|
68
72
|
end
|
69
73
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scalpel
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.2'
|
4
|
+
version: 0.2.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-12-21 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: ! ' Scalpel is a sentence segmentation tool for Ruby. It allows you to
|
15
15
|
split a text into an array of sentences. It is simple, lightweight, blazing fast
|