pragmatic_segmenter 0.3.19 → 0.3.20

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3f03709232bf5b433b73de74a04b254228bc3ce6
4
- data.tar.gz: 087860da234619d431849de1f85614b876e6d31a
3
+ metadata.gz: 49b8f9ed555c4c3adba18700c7a910e3b67083aa
4
+ data.tar.gz: 0907aa732977255028708a162eece4e6225634da
5
5
  SHA512:
6
- metadata.gz: 6d2759f5a302f6acf40b34990df8fd20d352b0aef5dfdd1799e5f7303d065466964566c068c505460837e838a8fa296754031084ef42715aab709f5d7a2f882c
7
- data.tar.gz: 0ad1c988cff72e818fe9fdd4de77831e792e20834f27c6a40b34f594e8ef5a9927fdf1da259cdcd7a7d990b592c53f09bba11c59432b74d3b6a6f46615186378
6
+ metadata.gz: cfe1b18483d7fbe83d4bfe52971ab7ca95321e89c375e06d1c0e5686c7f2818930cf842a2963fb69bb08b213e6c5480a1fe1171610a8d08629089b028e5899f2
7
+ data.tar.gz: 4b0d8fdd97ce03c6e205e3c47312486e157ffa926e8ddb50f1e825ae75327379b3aee0dbbc61322bb15f1594535aecbf839a5842893f82d9a74caf2389a7f065
data/NEWS CHANGED
@@ -1,3 +1,10 @@
1
+ 0.3.20 (2018-08-28):
2
+
3
+ * Improvement: Handle slanted single quotation as a single quote
4
+ * Bug Fix: Handle text that contains a single character abbreviation as part of a list
5
+ * Bug Fix: Chinese book quotes
6
+ * Improvement: Add viz as abbreviation
7
+
1
8
  0.3.19 (2018-07-19):
2
9
 
3
10
  * Bug Fix: A parenthetical following an abbreviation is now included as part of the same segment. Example: "The parties to this Agreement are PragmaticSegmenterExampleCompanyA Inc. (“Company A”), and PragmaticSegmenterExampleCompanyB Inc. (“Company B”)." is now treated as one segment.
data/README.md CHANGED
@@ -868,6 +868,12 @@ To test the relative performance of different segmentation tools and libraries I
868
868
  **Version 0.3.19**
869
869
  * Treat a parenthetical following an abbreviation as part of the same segment
870
870
 
871
+ **Version 0.3.20**
872
+ * Handle slanted single quotation as a single quote
873
+ * Handle a single character abbreviation as part of a list
874
+ * Add support for Chinese double angle brackets (《》) and corner brackets (「」)
875
+ * Add viz as abbreviation
876
+
871
877
  ## Contributing
872
878
 
873
879
  If you find a text that is incorrectly segmented using this gem, please submit an issue.
@@ -8,6 +8,8 @@ module PragmaticSegmenter
8
8
  # Rubular: http://rubular.com/r/2YFrKWQUYi
9
9
  BETWEEN_SINGLE_QUOTES_REGEX = /(?<=\s)'(?:[^']|'[a-zA-Z])*'/
10
10
 
11
+ BETWEEN_SINGLE_QUOTE_SLANTED_REGEX = /(?<=\s)‘(?:[^’]|’[a-zA-Z])*’/
12
+
11
13
  # Rubular: http://rubular.com/r/3Pw1QlXOjd
12
14
  BETWEEN_DOUBLE_QUOTES_REGEX = /"(?>[^"\\]+|\\{2}|\\.)*"/
13
15
 
@@ -42,6 +44,7 @@ module PragmaticSegmenter
42
44
 
43
45
  def sub_punctuation_between_quotes_and_parens(txt)
44
46
  sub_punctuation_between_single_quotes(txt)
47
+ sub_punctuation_between_single_quote_slanted(txt)
45
48
  sub_punctuation_between_double_quotes(txt)
46
49
  sub_punctuation_between_square_brackets(txt)
47
50
  sub_punctuation_between_parens(txt)
@@ -74,6 +77,13 @@ module PragmaticSegmenter
74
77
  end
75
78
  end
76
79
 
80
+ def sub_punctuation_between_single_quote_slanted(txt)
81
+ PragmaticSegmenter::PunctuationReplacer.new(
82
+ matches_array: txt.scan(BETWEEN_SINGLE_QUOTE_SLANTED_REGEX),
83
+ text: txt
84
+ ).replace
85
+ end
86
+
77
87
  def sub_punctuation_between_double_quotes(txt)
78
88
  PragmaticSegmenter::PunctuationReplacer.new(
79
89
  matches_array: btwn_dbl_quote(txt),
@@ -8,6 +8,32 @@ module PragmaticSegmenter
8
8
  class AbbreviationReplacer < AbbreviationReplacer
9
9
  SENTENCE_STARTERS = [].freeze
10
10
  end
11
+
12
+ class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation
13
+ BETWEEN_DOUBLE_ANGLE_QUOTATION_MARK_REGEX = /《(?>[^》\\]+|\\{2}|\\.)*》/
14
+ BETWEEN_L_BRACKET_REGEX = /「(?>[^」\\]+|\\{2}|\\.)*」/
15
+ private
16
+
17
+ def sub_punctuation_between_quotes_and_parens(txt)
18
+ super
19
+ sub_punctuation_between_double_angled_quotation_marks(txt)
20
+ sub_punctuation_between_l_bracket(txt)
21
+ end
22
+
23
+ def sub_punctuation_between_double_angled_quotation_marks(txt)
24
+ PunctuationReplacer.new(
25
+ matches_array: txt.scan(BETWEEN_DOUBLE_ANGLE_QUOTATION_MARK_REGEX),
26
+ text: txt
27
+ ).replace
28
+ end
29
+
30
+ def sub_punctuation_between_l_bracket(txt)
31
+ PunctuationReplacer.new(
32
+ matches_array: txt.scan(BETWEEN_L_BRACKET_REGEX),
33
+ text: txt
34
+ ).replace
35
+ end
36
+ end
11
37
  end
12
38
  end
13
39
  end
@@ -11,7 +11,7 @@ module PragmaticSegmenter
11
11
 
12
12
  # Defines the abbreviations for each language (if available)
13
13
  module Abbreviation
14
- ABBREVIATIONS = Set.new(['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'rs', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sec', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']).freeze
14
+ ABBREVIATIONS = Set.new(['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'rs', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sec', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'viz', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']).freeze
15
15
  PREPOSITIVE_ABBREVIATIONS = Set.new(['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs']).freeze
16
16
  NUMBER_ABBREVIATIONS = Set.new(['art', 'ext', 'no', 'nos', 'p', 'pp']).freeze
17
17
  end
@@ -76,10 +76,10 @@ module PragmaticSegmenter
76
76
  # replaces the periods.
77
77
  module SingleLetterAbbreviationRules
78
78
  # Rubular: http://rubular.com/r/e3H6kwnr6H
79
- SingleUpperCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[A-Z])\.(?=\s)/, '∯')
79
+ SingleUpperCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[A-Z])\.(?=,?\s)/, '∯')
80
80
 
81
81
  # Rubular: http://rubular.com/r/gitvf0YWH4
82
- SingleUpperCaseLetterRule = Rule.new(/(?<=\s[A-Z])\.(?=\s)/, '∯')
82
+ SingleUpperCaseLetterRule = Rule.new(/(?<=\s[A-Z])\.(?=,?\s)/, '∯')
83
83
 
84
84
  All = [
85
85
  SingleUpperCaseLetterAtStartOfLineRule,
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module PragmaticSegmenter
4
- VERSION = "0.3.19"
4
+ VERSION = "0.3.20"
5
5
  end
@@ -7,5 +7,10 @@ RSpec.describe PragmaticSegmenter::Languages::Chinese, '(zh)' do
7
7
  ps = PragmaticSegmenter::Segmenter.new(text: "安永已聯繫周怡安親屬,協助辦理簽證相關事宜,周怡安家屬1月1日晚間搭乘東方航空班機抵達上海,他們步入入境大廳時神情落寞、不發一語。周怡安來自台中,去年剛從元智大學畢業,同年9月加入安永。", language: 'zh')
8
8
  expect(ps.segment).to eq(["安永已聯繫周怡安親屬,協助辦理簽證相關事宜,周怡安家屬1月1日晚間搭乘東方航空班機抵達上海,他們步入入境大廳時神情落寞、不發一語。", "周怡安來自台中,去年剛從元智大學畢業,同年9月加入安永。"])
9
9
  end
10
+
11
+ it 'correctly segments text #002' do
12
+ ps = PragmaticSegmenter::Segmenter.new(text: "我们明天一起去看《摔跤吧!爸爸》好吗?好!", language: 'zh')
13
+ expect(ps.segment).to eq(["我们明天一起去看《摔跤吧!爸爸》好吗?", "好!"])
14
+ end
10
15
  end
11
16
  end
@@ -1411,5 +1411,20 @@ RSpec.describe PragmaticSegmenter::Languages::English, "(en)" do
1411
1411
  ps = PragmaticSegmenter::Segmenter.new(text: text, clean: false)
1412
1412
  expect(ps.segment).to eq(["The parties to this Agreement are PragmaticSegmenterExampleCompanyA Inc. (“Company A”), and PragmaticSegmenterExampleCompanyB Inc. (“Company B”)."])
1413
1413
  end
1414
+
1415
+ it 'correctly segments text #119' do
1416
+ ps = PragmaticSegmenter::Segmenter.new(text: "Unlike the abbreviations i.e. and e.g., viz. is used to indicate a detailed description of something stated before.")
1417
+ expect(ps.segment).to eq(["Unlike the abbreviations i.e. and e.g., viz. is used to indicate a detailed description of something stated before."])
1418
+ end
1419
+
1420
+ it 'correctly segments text #120' do
1421
+ ps = PragmaticSegmenter::Segmenter.new(text: "For example, ‘dragonswort… is said that it should be grown in dragon’s blood. It grows at the tops of mountains where there are groves of trees, chiefly in holy places and in the country that is called Apulia’ (translated by Anne Van Arsdall, in Medieval Herbal Remedies: The Old English Herbarium and Anglo-Saxon Medicine p. 154). The Herbal also includes lore about other plants, such as the mandrake.")
1422
+ expect(ps.segment).to eq(["For example, ‘dragonswort… is said that it should be grown in dragon’s blood. It grows at the tops of mountains where there are groves of trees, chiefly in holy places and in the country that is called Apulia’ (translated by Anne Van Arsdall, in Medieval Herbal Remedies: The Old English Herbarium and Anglo-Saxon Medicine p. 154).", "The Herbal also includes lore about other plants, such as the mandrake."])
1423
+ end
1424
+
1425
+ it 'correctly segments text #121' do
1426
+ ps = PragmaticSegmenter::Segmenter.new(text: "Here’s the - ahem - official citation: Baker, C., Anderson, Kenneth, Martin, James, & Palen, Leysia. Modeling Open Source Software Communities, ProQuest Dissertations and Theses.")
1427
+ expect(ps.segment).to eq(["Here’s the - ahem - official citation: Baker, C., Anderson, Kenneth, Martin, James, & Palen, Leysia.", "Modeling Open Source Software Communities, ProQuest Dissertations and Theses."])
1428
+ end
1414
1429
  end
1415
1430
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.19
4
+ version: 0.3.20
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-07-19 00:00:00.000000000 Z
11
+ date: 2018-08-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: unicode