pragmatic_segmenter 0.3.20 → 0.3.21

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 49b8f9ed555c4c3adba18700c7a910e3b67083aa
4
- data.tar.gz: 0907aa732977255028708a162eece4e6225634da
3
+ metadata.gz: a13bc873b763cc68e4bfebcd0c291d3eab12a429
4
+ data.tar.gz: 85666c4d354ba1a0e5118c0e3e23b9c4d1978ae4
5
5
  SHA512:
6
- metadata.gz: cfe1b18483d7fbe83d4bfe52971ab7ca95321e89c375e06d1c0e5686c7f2818930cf842a2963fb69bb08b213e6c5480a1fe1171610a8d08629089b028e5899f2
7
- data.tar.gz: 4b0d8fdd97ce03c6e205e3c47312486e157ffa926e8ddb50f1e825ae75327379b3aee0dbbc61322bb15f1594535aecbf839a5842893f82d9a74caf2389a7f065
6
+ metadata.gz: f3c211daa3aaf71d4ff1d363ee15711c8979db6fa3cf9fb87d84108466f0d8fe9ee7b9694ad32a6439a79c471ab013e70f536601d951a7e9400addb954f35c76
7
+ data.tar.gz: de40f070a216d90cff094bb6cfdb83b1f1c108774ff67da18e58846fc69aab78decc234f0b0c7fda7931ffcc7e8ca1e427d28fd495922a28aa91df02981917e6
data/NEWS CHANGED
@@ -1,3 +1,8 @@
1
+ 0.3.21 (2018-08-30):
2
+
3
+ * Improvement: Add support for file format extensions (e.g. .jpeg, .zip) so the leading period is not treated as a sentence boundary
4
+ * Improvement: Add support for numeric references at the end of a sentence (e.g. Wikipedia references)
5
+
1
6
  0.3.20 (2018-08-28):
2
7
 
3
8
  * Improvement: Handle slanted single quotation as a single quote
data/README.md CHANGED
@@ -874,6 +874,10 @@ To test the relative performance of different segmentation tools and libraries I
874
874
  * Add support for Chinese caret brackets
875
875
  * Add viz as abbreviation
876
876
 
877
+ **Version 0.3.21**
878
+ * Add support for file format extensions (e.g. .jpeg, .zip) so the leading period is not treated as a sentence boundary
879
+ * Add support for numeric references at the end of a sentence (e.g. Wikipedia references)
880
+
877
881
  ## Contributing
878
882
 
879
883
  If you find a text that is incorrectly segmented using this gem, please submit an issue.
@@ -24,6 +24,8 @@ module PragmaticSegmenter
24
24
  # Rubular: http://rubular.com/r/G2opjedIm9
25
25
  GeoLocationRule = Rule.new(/(?<=[a-zA-z]°)\.(?=\s*\d+)/, '∯')
26
26
 
27
+ FileFormatRule = Rule.new(/(?<=\s)\.(?=(jpe?g|png|gif|tiff?|pdf|ps|docx?|xlsx?|svg|bmp|tga|exif|odt|html?|txt|rtf|bat|sxw|xml|zip|exe|msi|blend|wmv|mp[34]|pptx?|flac|rb|cpp|cs|js)\s)/, '∯')
28
+
27
29
  SingleNewLineRule = Rule.new(/\n/, 'ȹ')
28
30
 
29
31
  module DoublePunctuationRules
@@ -47,6 +47,8 @@ module PragmaticSegmenter
47
47
  # Rubular: http://rubular.com/r/mQ8Es9bxtk
48
48
  CONTINUOUS_PUNCTUATION_REGEX = /(?<=\S)(!|\?){3,}(?=(\s|\z|$))/
49
49
 
50
+ NUMBERED_REFERENCE_REGEX = /(?<=[^\d\s])(\.|∯)((\[(\d{1,3},?\s?-?\s?)*\b\d{1,3}\])+|((\d{1,3}\s?)*\d{1,3}))(\s)(?=[A-Z])/
51
+
50
52
  # Rubular: http://rubular.com/r/yqa4Rit8EY
51
53
  PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯')
52
54
 
@@ -23,8 +23,10 @@ module PragmaticSegmenter
23
23
  replace_abbreviations
24
24
  replace_numbers
25
25
  replace_continuous_punctuation
26
+ replace_periods_before_numeric_references
26
27
  @text.apply(@language::Abbreviations::WithMultiplePeriodsAndEmailRule)
27
28
  @text.apply(@language::GeoLocationRule)
29
+ @text.apply(@language::FileFormatRule)
28
30
  split_into_segments
29
31
  end
30
32
 
@@ -68,6 +70,10 @@ module PragmaticSegmenter
68
70
  end
69
71
  end
70
72
 
73
+ def replace_periods_before_numeric_references
74
+ @text.gsub!(@language::NUMBERED_REFERENCE_REGEX, "∯\\2\r\\7")
75
+ end
76
+
71
77
  def consecutive_underscore?(txt)
72
78
  # Rubular: http://rubular.com/r/fTF2Ff3WBL
73
79
  txt.gsub(/_{3,}/, '').length.eql?(0)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module PragmaticSegmenter
4
- VERSION = "0.3.20"
4
+ VERSION = "0.3.21"
5
5
  end
@@ -1426,5 +1426,41 @@ RSpec.describe PragmaticSegmenter::Languages::English, "(en)" do
1426
1426
  ps = PragmaticSegmenter::Segmenter.new(text: "Here’s the - ahem - official citation: Baker, C., Anderson, Kenneth, Martin, James, & Palen, Leysia. Modeling Open Source Software Communities, ProQuest Dissertations and Theses.")
1427
1427
  expect(ps.segment).to eq(["Here’s the - ahem - official citation: Baker, C., Anderson, Kenneth, Martin, James, & Palen, Leysia.", "Modeling Open Source Software Communities, ProQuest Dissertations and Theses."])
1428
1428
  end
1429
+
1430
+ it 'correctly segments text #122' do
1431
+ ps = PragmaticSegmenter::Segmenter.new(text: "These include images of various modes of transport and members of the team, all available in .jpeg format. Images can be downloaded from our website. We also offer archives as .zip files.")
1432
+ expect(ps.segment).to eq(["These include images of various modes of transport and members of the team, all available in .jpeg format.", "Images can be downloaded from our website.", "We also offer archives as .zip files."])
1433
+ end
1434
+
1435
+ it 'correctly segments text #123' do
1436
+ ps = PragmaticSegmenter::Segmenter.new(text: "Saint Maximus (died 250) is a Christian saint and martyr.[1] The emperor Decius published a decree ordering the veneration of busts of the deified emperors.")
1437
+ expect(ps.segment).to eq(["Saint Maximus (died 250) is a Christian saint and martyr.[1]", "The emperor Decius published a decree ordering the veneration of busts of the deified emperors."])
1438
+ end
1439
+
1440
+ it 'correctly segments text #124' do
1441
+ ps = PragmaticSegmenter::Segmenter.new(text: "Differing agendas can potentially create an understanding gap in a consultation.11 12 Take the example of one of the most common presentations in ill health: the common cold.")
1442
+ expect(ps.segment).to eq(["Differing agendas can potentially create an understanding gap in a consultation.11 12", "Take the example of one of the most common presentations in ill health: the common cold."])
1443
+ end
1444
+
1445
+ it 'correctly segments text #125' do
1446
+ ps = PragmaticSegmenter::Segmenter.new(text: "Daniel Kahneman popularised the concept of fast and slow thinking: the distinction between instinctive (type 1 thinking) and reflective, analytical cognition (type 2).10 This model relates to doctors achieving a balance between efficiency and effectiveness.")
1447
+ expect(ps.segment).to eq(["Daniel Kahneman popularised the concept of fast and slow thinking: the distinction between instinctive (type 1 thinking) and reflective, analytical cognition (type 2).10", "This model relates to doctors achieving a balance between efficiency and effectiveness."])
1448
+ end
1449
+
1450
+ it 'correctly segments text #126' do
1451
+ ps = PragmaticSegmenter::Segmenter.new(text: "Its traditional use[1] is well documented in the ethnobotanical literature [2–11]. Leaves, buds, tar and essential oils are used to treat a wide spectrum of diseases.")
1452
+ expect(ps.segment).to eq(["Its traditional use[1] is well documented in the ethnobotanical literature [2–11].", "Leaves, buds, tar and essential oils are used to treat a wide spectrum of diseases."])
1453
+ end
1454
+
1455
+ it 'correctly segments text #127' do
1456
+ ps = PragmaticSegmenter::Segmenter.new(text: "Thus increasing the desire for political reform both in Lancashire and in the country at large.[7][8] This was a serious misdemeanour,[16] encouraging them to declare the assembly illegal as soon as it was announced on 31 July.[17][18] The radicals sought a second opinion on the meeting's legality.")
1457
+ expect(ps.segment).to eq(["Thus increasing the desire for political reform both in Lancashire and in the country at large.[7][8]", "This was a serious misdemeanour,[16] encouraging them to declare the assembly illegal as soon as it was announced on 31 July.[17][18]", "The radicals sought a second opinion on the meeting's legality."])
1458
+ end
1459
+
1460
+ it 'correctly segments text #128' do
1461
+ ps = PragmaticSegmenter::Segmenter.new(text: "The table in (4) is a sample from the Wall Street Journal (1987).1 According to the distribution all the pairs given in (4) count as candidates for abbreviations.")
1462
+ expect(ps.segment).to eq([ "The table in (4) is a sample from the Wall Street Journal (1987).1", "According to the distribution all the pairs given in (4) count as candidates for abbreviations."])
1463
+
1464
+ end
1429
1465
  end
1430
1466
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.20
4
+ version: 0.3.21
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-08-27 00:00:00.000000000 Z
11
+ date: 2018-08-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: unicode