pragmatic_segmenter 0.3.19 → 0.3.20
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/NEWS +7 -0
- data/README.md +6 -0
- data/lib/pragmatic_segmenter/between_punctuation.rb +10 -0
- data/lib/pragmatic_segmenter/languages/chinese.rb +26 -0
- data/lib/pragmatic_segmenter/languages/common.rb +1 -1
- data/lib/pragmatic_segmenter/languages/common/numbers.rb +2 -2
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter/languages/chinese_spec.rb +5 -0
- data/spec/pragmatic_segmenter/languages/english_spec.rb +15 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 49b8f9ed555c4c3adba18700c7a910e3b67083aa
|
4
|
+
data.tar.gz: 0907aa732977255028708a162eece4e6225634da
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cfe1b18483d7fbe83d4bfe52971ab7ca95321e89c375e06d1c0e5686c7f2818930cf842a2963fb69bb08b213e6c5480a1fe1171610a8d08629089b028e5899f2
|
7
|
+
data.tar.gz: 4b0d8fdd97ce03c6e205e3c47312486e157ffa926e8ddb50f1e825ae75327379b3aee0dbbc61322bb15f1594535aecbf839a5842893f82d9a74caf2389a7f065
|
data/NEWS
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
0.3.20 (2018-08-28):
|
2
|
+
|
3
|
+
* Improvement: Handle slanted single quotation as a single quote
|
4
|
+
* Bug Fix: The text contains a single character abbreviation as part of a list
|
5
|
+
* Bug Fix: Chinese book quotes
|
6
|
+
* Improvement: Add viz as abbreviation
|
7
|
+
|
1
8
|
0.3.19 (2018-07-19):
|
2
9
|
|
3
10
|
* Bug Fix: A parenthetical following an abbreviation is now included as part of the same segment. Example: "The parties to this Agreement are PragmaticSegmenterExampleCompanyA Inc. (“Company A”), and PragmaticSegmenterExampleCompanyB Inc. (“Company B”)." is now treated as one segment.
|
data/README.md
CHANGED
@@ -868,6 +868,12 @@ To test the relative performance of different segmentation tools and libraries I
|
|
868
868
|
**Version 0.3.19**
|
869
869
|
* Treat a parenthetical following an abbreviation as part of the same segment
|
870
870
|
|
871
|
+
**Version 0.3.20**
|
872
|
+
* Handle slanted single quotation as a single quote
|
873
|
+
* Handle a single character abbreviation as part of a list
|
874
|
+
* Add support for Chinese caret brackets
|
875
|
+
* Add viz as abbreviation
|
876
|
+
|
871
877
|
## Contributing
|
872
878
|
|
873
879
|
If you find a text that is incorrectly segmented using this gem, please submit an issue.
|
@@ -8,6 +8,8 @@ module PragmaticSegmenter
|
|
8
8
|
# Rubular: http://rubular.com/r/2YFrKWQUYi
|
9
9
|
BETWEEN_SINGLE_QUOTES_REGEX = /(?<=\s)'(?:[^']|'[a-zA-Z])*'/
|
10
10
|
|
11
|
+
BETWEEN_SINGLE_QUOTE_SLANTED_REGEX = /(?<=\s)‘(?:[^’]|’[a-zA-Z])*’/
|
12
|
+
|
11
13
|
# Rubular: http://rubular.com/r/3Pw1QlXOjd
|
12
14
|
BETWEEN_DOUBLE_QUOTES_REGEX = /"(?>[^"\\]+|\\{2}|\\.)*"/
|
13
15
|
|
@@ -42,6 +44,7 @@ module PragmaticSegmenter
|
|
42
44
|
|
43
45
|
def sub_punctuation_between_quotes_and_parens(txt)
|
44
46
|
sub_punctuation_between_single_quotes(txt)
|
47
|
+
sub_punctuation_between_single_quote_slanted(txt)
|
45
48
|
sub_punctuation_between_double_quotes(txt)
|
46
49
|
sub_punctuation_between_square_brackets(txt)
|
47
50
|
sub_punctuation_between_parens(txt)
|
@@ -74,6 +77,13 @@ module PragmaticSegmenter
|
|
74
77
|
end
|
75
78
|
end
|
76
79
|
|
80
|
+
def sub_punctuation_between_single_quote_slanted(txt)
|
81
|
+
PragmaticSegmenter::PunctuationReplacer.new(
|
82
|
+
matches_array: txt.scan(BETWEEN_SINGLE_QUOTE_SLANTED_REGEX),
|
83
|
+
text: txt
|
84
|
+
).replace
|
85
|
+
end
|
86
|
+
|
77
87
|
def sub_punctuation_between_double_quotes(txt)
|
78
88
|
PragmaticSegmenter::PunctuationReplacer.new(
|
79
89
|
matches_array: btwn_dbl_quote(txt),
|
@@ -8,6 +8,32 @@ module PragmaticSegmenter
|
|
8
8
|
class AbbreviationReplacer < AbbreviationReplacer
|
9
9
|
SENTENCE_STARTERS = [].freeze
|
10
10
|
end
|
11
|
+
|
12
|
+
class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation
|
13
|
+
BETWEEN_DOUBLE_ANGLE_QUOTATION_MARK_REGEX = /《(?>[^》\\]+|\\{2}|\\.)*》/
|
14
|
+
BETWEEN_L_BRACKET_REGEX = /「(?>[^」\\]+|\\{2}|\\.)*」/
|
15
|
+
private
|
16
|
+
|
17
|
+
def sub_punctuation_between_quotes_and_parens(txt)
|
18
|
+
super
|
19
|
+
sub_punctuation_between_double_angled_quotation_marks(txt)
|
20
|
+
sub_punctuation_between_l_bracket(txt)
|
21
|
+
end
|
22
|
+
|
23
|
+
def sub_punctuation_between_double_angled_quotation_marks(txt)
|
24
|
+
PunctuationReplacer.new(
|
25
|
+
matches_array: txt.scan(BETWEEN_DOUBLE_ANGLE_QUOTATION_MARK_REGEX),
|
26
|
+
text: txt
|
27
|
+
).replace
|
28
|
+
end
|
29
|
+
|
30
|
+
def sub_punctuation_between_l_bracket(txt)
|
31
|
+
PunctuationReplacer.new(
|
32
|
+
matches_array: txt.scan(BETWEEN_L_BRACKET_REGEX),
|
33
|
+
text: txt
|
34
|
+
).replace
|
35
|
+
end
|
36
|
+
end
|
11
37
|
end
|
12
38
|
end
|
13
39
|
end
|
@@ -11,7 +11,7 @@ module PragmaticSegmenter
|
|
11
11
|
|
12
12
|
# Defines the abbreviations for each language (if available)
|
13
13
|
module Abbreviation
|
14
|
-
ABBREVIATIONS = Set.new(['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'rs', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sec', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']).freeze
|
14
|
+
ABBREVIATIONS = Set.new(['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'rs', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sec', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'viz', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']).freeze
|
15
15
|
PREPOSITIVE_ABBREVIATIONS = Set.new(['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs']).freeze
|
16
16
|
NUMBER_ABBREVIATIONS = Set.new(['art', 'ext', 'no', 'nos', 'p', 'pp']).freeze
|
17
17
|
end
|
@@ -76,10 +76,10 @@ module PragmaticSegmenter
|
|
76
76
|
# replaces the periods.
|
77
77
|
module SingleLetterAbbreviationRules
|
78
78
|
# Rubular: http://rubular.com/r/e3H6kwnr6H
|
79
|
-
SingleUpperCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[A-Z])\.(?=\s)/, '∯')
|
79
|
+
SingleUpperCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[A-Z])\.(?=,?\s)/, '∯')
|
80
80
|
|
81
81
|
# Rubular: http://rubular.com/r/gitvf0YWH4
|
82
|
-
SingleUpperCaseLetterRule = Rule.new(/(?<=\s[A-Z])\.(?=\s)/, '∯')
|
82
|
+
SingleUpperCaseLetterRule = Rule.new(/(?<=\s[A-Z])\.(?=,?\s)/, '∯')
|
83
83
|
|
84
84
|
All = [
|
85
85
|
SingleUpperCaseLetterAtStartOfLineRule,
|
@@ -7,5 +7,10 @@ RSpec.describe PragmaticSegmenter::Languages::Chinese, '(zh)' do
|
|
7
7
|
ps = PragmaticSegmenter::Segmenter.new(text: "安永已聯繫周怡安親屬,協助辦理簽證相關事宜,周怡安家屬1月1日晚間搭乘東方航空班機抵達上海,他們步入入境大廳時神情落寞、不發一語。周怡安來自台中,去年剛從元智大學畢業,同年9月加入安永。", language: 'zh')
|
8
8
|
expect(ps.segment).to eq(["安永已聯繫周怡安親屬,協助辦理簽證相關事宜,周怡安家屬1月1日晚間搭乘東方航空班機抵達上海,他們步入入境大廳時神情落寞、不發一語。", "周怡安來自台中,去年剛從元智大學畢業,同年9月加入安永。"])
|
9
9
|
end
|
10
|
+
|
11
|
+
it 'correctly segments text #002' do
|
12
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "我们明天一起去看《摔跤吧!爸爸》好吗?好!", language: 'zh')
|
13
|
+
expect(ps.segment).to eq(["我们明天一起去看《摔跤吧!爸爸》好吗?", "好!"])
|
14
|
+
end
|
10
15
|
end
|
11
16
|
end
|
@@ -1411,5 +1411,20 @@ RSpec.describe PragmaticSegmenter::Languages::English, "(en)" do
|
|
1411
1411
|
ps = PragmaticSegmenter::Segmenter.new(text: text, clean: false)
|
1412
1412
|
expect(ps.segment).to eq(["The parties to this Agreement are PragmaticSegmenterExampleCompanyA Inc. (“Company A”), and PragmaticSegmenterExampleCompanyB Inc. (“Company B”)."])
|
1413
1413
|
end
|
1414
|
+
|
1415
|
+
it 'correctly segments text #119' do
|
1416
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Unlike the abbreviations i.e. and e.g., viz. is used to indicate a detailed description of something stated before.")
|
1417
|
+
expect(ps.segment).to eq(["Unlike the abbreviations i.e. and e.g., viz. is used to indicate a detailed description of something stated before."])
|
1418
|
+
end
|
1419
|
+
|
1420
|
+
it 'correctly segments text #120' do
|
1421
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "For example, ‘dragonswort… is said that it should be grown in dragon’s blood. It grows at the tops of mountains where there are groves of trees, chiefly in holy places and in the country that is called Apulia’ (translated by Anne Van Arsdall, in Medieval Herbal Remedies: The Old English Herbarium and Anglo-Saxon Medicine p. 154). The Herbal also includes lore about other plants, such as the mandrake.")
|
1422
|
+
expect(ps.segment).to eq(["For example, ‘dragonswort… is said that it should be grown in dragon’s blood. It grows at the tops of mountains where there are groves of trees, chiefly in holy places and in the country that is called Apulia’ (translated by Anne Van Arsdall, in Medieval Herbal Remedies: The Old English Herbarium and Anglo-Saxon Medicine p. 154).", "The Herbal also includes lore about other plants, such as the mandrake."])
|
1423
|
+
end
|
1424
|
+
|
1425
|
+
it 'correctly segments text #121' do
|
1426
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Here’s the - ahem - official citation: Baker, C., Anderson, Kenneth, Martin, James, & Palen, Leysia. Modeling Open Source Software Communities, ProQuest Dissertations and Theses.")
|
1427
|
+
expect(ps.segment).to eq(["Here’s the - ahem - official citation: Baker, C., Anderson, Kenneth, Martin, James, & Palen, Leysia.", "Modeling Open Source Software Communities, ProQuest Dissertations and Theses."])
|
1428
|
+
end
|
1414
1429
|
end
|
1415
1430
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.19
|
4
|
+
version: 0.3.20
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-08-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: unicode
|