pragmatic_segmenter 0.3.20 → 0.3.21
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/NEWS +5 -0
- data/README.md +4 -0
- data/lib/pragmatic_segmenter/languages/common.rb +2 -0
- data/lib/pragmatic_segmenter/languages/common/numbers.rb +2 -0
- data/lib/pragmatic_segmenter/processor.rb +6 -0
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter/languages/english_spec.rb +36 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a13bc873b763cc68e4bfebcd0c291d3eab12a429
|
4
|
+
data.tar.gz: 85666c4d354ba1a0e5118c0e3e23b9c4d1978ae4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f3c211daa3aaf71d4ff1d363ee15711c8979db6fa3cf9fb87d84108466f0d8fe9ee7b9694ad32a6439a79c471ab013e70f536601d951a7e9400addb954f35c76
|
7
|
+
data.tar.gz: de40f070a216d90cff094bb6cfdb83b1f1c108774ff67da18e58846fc69aab78decc234f0b0c7fda7931ffcc7e8ca1e427d28fd495922a28aa91df02981917e6
|
data/NEWS
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
0.3.21 (2018-08-30):
|
2
|
+
|
3
|
+
* Improvement: Add support for file formats
|
4
|
+
* Improvement: Add support for numeric references at the end of a sentence (e.g. Wikipedia references)
|
5
|
+
|
1
6
|
0.3.20 (2018-08-28):
|
2
7
|
|
3
8
|
* Improvement: Handle slanted single quotation as a single quote
|
data/README.md
CHANGED
@@ -874,6 +874,10 @@ To test the relative performance of different segmentation tools and libraries I
|
|
874
874
|
* Add support for Chinese caret brackets
|
875
875
|
* Add viz as abbreviation
|
876
876
|
|
877
|
+
**Version 0.3.21**
|
878
|
+
* Add support for file formats
|
879
|
+
* Add support for numeric references at the end of a sentence (e.g. Wikipedia references)
|
880
|
+
|
877
881
|
## Contributing
|
878
882
|
|
879
883
|
If you find a text that is incorrectly segmented using this gem, please submit an issue.
|
@@ -24,6 +24,8 @@ module PragmaticSegmenter
|
|
24
24
|
# Rubular: http://rubular.com/r/G2opjedIm9
|
25
25
|
GeoLocationRule = Rule.new(/(?<=[a-zA-z]°)\.(?=\s*\d+)/, '∯')
|
26
26
|
|
27
|
+
FileFormatRule = Rule.new(/(?<=\s)\.(?=(jpe?g|png|gif|tiff?|pdf|ps|docx?|xlsx?|svg|bmp|tga|exif|odt|html?|txt|rtf|bat|sxw|xml|zip|exe|msi|blend|wmv|mp[34]|pptx?|flac|rb|cpp|cs|js)\s)/, '∯')
|
28
|
+
|
27
29
|
SingleNewLineRule = Rule.new(/\n/, 'ȹ')
|
28
30
|
|
29
31
|
module DoublePunctuationRules
|
@@ -47,6 +47,8 @@ module PragmaticSegmenter
|
|
47
47
|
# Rubular: http://rubular.com/r/mQ8Es9bxtk
|
48
48
|
CONTINUOUS_PUNCTUATION_REGEX = /(?<=\S)(!|\?){3,}(?=(\s|\z|$))/
|
49
49
|
|
50
|
+
NUMBERED_REFERENCE_REGEX = /(?<=[^\d\s])(\.|∯)((\[(\d{1,3},?\s?-?\s?)*\b\d{1,3}\])+|((\d{1,3}\s?)*\d{1,3}))(\s)(?=[A-Z])/
|
51
|
+
|
50
52
|
# Rubular: http://rubular.com/r/yqa4Rit8EY
|
51
53
|
PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯')
|
52
54
|
|
@@ -23,8 +23,10 @@ module PragmaticSegmenter
|
|
23
23
|
replace_abbreviations
|
24
24
|
replace_numbers
|
25
25
|
replace_continuous_punctuation
|
26
|
+
replace_periods_before_numeric_references
|
26
27
|
@text.apply(@language::Abbreviations::WithMultiplePeriodsAndEmailRule)
|
27
28
|
@text.apply(@language::GeoLocationRule)
|
29
|
+
@text.apply(@language::FileFormatRule)
|
28
30
|
split_into_segments
|
29
31
|
end
|
30
32
|
|
@@ -68,6 +70,10 @@ module PragmaticSegmenter
|
|
68
70
|
end
|
69
71
|
end
|
70
72
|
|
73
|
+
def replace_periods_before_numeric_references
|
74
|
+
@text.gsub!(@language::NUMBERED_REFERENCE_REGEX, "∯\\2\r\\7")
|
75
|
+
end
|
76
|
+
|
71
77
|
def consecutive_underscore?(txt)
|
72
78
|
# Rubular: http://rubular.com/r/fTF2Ff3WBL
|
73
79
|
txt.gsub(/_{3,}/, '').length.eql?(0)
|
@@ -1426,5 +1426,41 @@ RSpec.describe PragmaticSegmenter::Languages::English, "(en)" do
|
|
1426
1426
|
ps = PragmaticSegmenter::Segmenter.new(text: "Here’s the - ahem - official citation: Baker, C., Anderson, Kenneth, Martin, James, & Palen, Leysia. Modeling Open Source Software Communities, ProQuest Dissertations and Theses.")
|
1427
1427
|
expect(ps.segment).to eq(["Here’s the - ahem - official citation: Baker, C., Anderson, Kenneth, Martin, James, & Palen, Leysia.", "Modeling Open Source Software Communities, ProQuest Dissertations and Theses."])
|
1428
1428
|
end
|
1429
|
+
|
1430
|
+
it 'correctly segments text #122' do
|
1431
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "These include images of various modes of transport and members of the team, all available in .jpeg format. Images can be downloaded from our website. We also offer archives as .zip files.")
|
1432
|
+
expect(ps.segment).to eq(["These include images of various modes of transport and members of the team, all available in .jpeg format.", "Images can be downloaded from our website.", "We also offer archives as .zip files."])
|
1433
|
+
end
|
1434
|
+
|
1435
|
+
it 'correctly segments text #123' do
|
1436
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Saint Maximus (died 250) is a Christian saint and martyr.[1] The emperor Decius published a decree ordering the veneration of busts of the deified emperors.")
|
1437
|
+
expect(ps.segment).to eq(["Saint Maximus (died 250) is a Christian saint and martyr.[1]", "The emperor Decius published a decree ordering the veneration of busts of the deified emperors."])
|
1438
|
+
end
|
1439
|
+
|
1440
|
+
it 'correctly segments text #124' do
|
1441
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Differing agendas can potentially create an understanding gap in a consultation.11 12 Take the example of one of the most common presentations in ill health: the common cold.")
|
1442
|
+
expect(ps.segment).to eq(["Differing agendas can potentially create an understanding gap in a consultation.11 12", "Take the example of one of the most common presentations in ill health: the common cold."])
|
1443
|
+
end
|
1444
|
+
|
1445
|
+
it 'correctly segments text #125' do
|
1446
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Daniel Kahneman popularised the concept of fast and slow thinking: the distinction between instinctive (type 1 thinking) and reflective, analytical cognition (type 2).10 This model relates to doctors achieving a balance between efficiency and effectiveness.")
|
1447
|
+
expect(ps.segment).to eq(["Daniel Kahneman popularised the concept of fast and slow thinking: the distinction between instinctive (type 1 thinking) and reflective, analytical cognition (type 2).10", "This model relates to doctors achieving a balance between efficiency and effectiveness."])
|
1448
|
+
end
|
1449
|
+
|
1450
|
+
it 'correctly segments text #126' do
|
1451
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Its traditional use[1] is well documented in the ethnobotanical literature [2–11]. Leaves, buds, tar and essential oils are used to treat a wide spectrum of diseases.")
|
1452
|
+
expect(ps.segment).to eq(["Its traditional use[1] is well documented in the ethnobotanical literature [2–11].", "Leaves, buds, tar and essential oils are used to treat a wide spectrum of diseases."])
|
1453
|
+
end
|
1454
|
+
|
1455
|
+
it 'correctly segments text #127' do
|
1456
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Thus increasing the desire for political reform both in Lancashire and in the country at large.[7][8] This was a serious misdemeanour,[16] encouraging them to declare the assembly illegal as soon as it was announced on 31 July.[17][18] The radicals sought a second opinion on the meeting's legality.")
|
1457
|
+
expect(ps.segment).to eq(["Thus increasing the desire for political reform both in Lancashire and in the country at large.[7][8]", "This was a serious misdemeanour,[16] encouraging them to declare the assembly illegal as soon as it was announced on 31 July.[17][18]", "The radicals sought a second opinion on the meeting's legality."])
|
1458
|
+
end
|
1459
|
+
|
1460
|
+
it 'correctly segments text #128' do
|
1461
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "The table in (4) is a sample from the Wall Street Journal (1987).1 According to the distribution all the pairs given in (4) count as candidates for abbreviations.")
|
1462
|
+
expect(ps.segment).to eq([ "The table in (4) is a sample from the Wall Street Journal (1987).1", "According to the distribution all the pairs given in (4) count as candidates for abbreviations."])
|
1463
|
+
|
1464
|
+
end
|
1429
1465
|
end
|
1430
1466
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.20
|
4
|
+
version: 0.3.21
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-08-
|
11
|
+
date: 2018-08-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: unicode
|