pragmatic_segmenter 0.3.17 → 0.3.18

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. checksums.yaml +4 -4
  2. data/NEWS +4 -0
  3. data/README.md +4 -1
  4. data/lib/pragmatic_segmenter/abbreviation_replacer.rb +10 -15
  5. data/lib/pragmatic_segmenter/between_punctuation.rb +1 -0
  6. data/lib/pragmatic_segmenter/cleaner.rb +3 -1
  7. data/lib/pragmatic_segmenter/cleaner/rules.rb +2 -0
  8. data/lib/pragmatic_segmenter/exclamation_words.rb +8 -7
  9. data/lib/pragmatic_segmenter/languages.rb +2 -0
  10. data/lib/pragmatic_segmenter/languages/amharic.rb +2 -0
  11. data/lib/pragmatic_segmenter/languages/arabic.rb +2 -0
  12. data/lib/pragmatic_segmenter/languages/armenian.rb +2 -0
  13. data/lib/pragmatic_segmenter/languages/bulgarian.rb +2 -0
  14. data/lib/pragmatic_segmenter/languages/burmese.rb +2 -0
  15. data/lib/pragmatic_segmenter/languages/chinese.rb +2 -0
  16. data/lib/pragmatic_segmenter/languages/common.rb +2 -0
  17. data/lib/pragmatic_segmenter/languages/common/ellipsis.rb +1 -0
  18. data/lib/pragmatic_segmenter/languages/common/numbers.rb +1 -0
  19. data/lib/pragmatic_segmenter/languages/danish.rb +2 -0
  20. data/lib/pragmatic_segmenter/languages/deutsch.rb +2 -0
  21. data/lib/pragmatic_segmenter/languages/dutch.rb +2 -0
  22. data/lib/pragmatic_segmenter/languages/english.rb +2 -0
  23. data/lib/pragmatic_segmenter/languages/french.rb +2 -0
  24. data/lib/pragmatic_segmenter/languages/greek.rb +2 -0
  25. data/lib/pragmatic_segmenter/languages/hindi.rb +2 -0
  26. data/lib/pragmatic_segmenter/languages/italian.rb +2 -0
  27. data/lib/pragmatic_segmenter/languages/japanese.rb +2 -0
  28. data/lib/pragmatic_segmenter/languages/persian.rb +2 -0
  29. data/lib/pragmatic_segmenter/languages/polish.rb +2 -0
  30. data/lib/pragmatic_segmenter/languages/russian.rb +2 -0
  31. data/lib/pragmatic_segmenter/languages/spanish.rb +2 -0
  32. data/lib/pragmatic_segmenter/languages/urdu.rb +2 -0
  33. data/lib/pragmatic_segmenter/list.rb +6 -6
  34. data/lib/pragmatic_segmenter/processor.rb +2 -0
  35. data/lib/pragmatic_segmenter/punctuation_replacer.rb +2 -1
  36. data/lib/pragmatic_segmenter/segmenter.rb +2 -0
  37. data/lib/pragmatic_segmenter/types.rb +2 -0
  38. data/lib/pragmatic_segmenter/version.rb +3 -1
  39. metadata +3 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: baf0f9cb38a40398530e15df0d28ab8a654c49c6
4
- data.tar.gz: 4a184ffa70092a3f99fe7d194a71e7a935b1d3b2
3
+ metadata.gz: 3bb581e56e988521adc41dbb94bc7281ee7dfa95
4
+ data.tar.gz: 0c3b6fe877a5d39d36053b7ed68a860b5d17779b
5
5
  SHA512:
6
- metadata.gz: 089a40744464256d33ce30b4c6d40a6f00bcd5dd0178757a3d65b3f5ff18242751e5074902bd0aab0460e97e5db0040a30f206648e98c14ec861abf0ba92680c
7
- data.tar.gz: e49bdbe345e2d2e394d2f4ef03eac2ca0488f3dac158dfd5c193a328368326963785bf643a565d599d0e4d49196e3480d7f8054ec1df4927c192958bb1e1de47
6
+ metadata.gz: 1b1dd64b5a382e8bb7ed5d79fbb9264565d71088f234ba4a9bd7cae47184e1c64f78b32a72f39326aa31936c6c6742aa2ff75cd75cd1b328987a6061a4d2534b
7
+ data.tar.gz: c150b178c93b7183300559e89c117cf8f9f93adf6ef33790a3ce0b292c66588fe6461c724210f7f93e078d2d08a10e995d7234bad963f8d5aa1c52c378effe5e
data/NEWS CHANGED
@@ -1,3 +1,7 @@
1
+ 0.3.18 (2018-03-27):
2
+
3
+ * Improvement: Performance optimizations
4
+
1
5
  0.3.17 (2017-12-07):
2
6
 
3
7
  * Bug Fix: Regex for parsing HTML
data/README.md CHANGED
@@ -77,7 +77,7 @@ Pragmatic Segmenter is opinionated and made for the explicit purpose of segmenti
77
77
  Pragmatic Segmenter is specifically used for segmenting texts for use in translation (and translation memory) related applications. Therefore, Pragmatic Segmenter takes a stance on some formatting and segmentation gray areas with the goal of improving the segmentation for the above-stated purpose. Some examples:
78
78
 
79
79
  - Removes 'table of contents' style long string of periods ('............')
80
- - Keeps parenthetical sentences within a sentence as one segment for clarity even though technically there are multiple grammatical sentences within the segment
80
+ - Keeps parentheticals and quotations, whether standalone or embedded within a sentence, as one segment for clarity, even though technically there may be multiple grammatical sentences within the segment
81
81
  - Strips out any xhtml code
82
82
  - Conservative in cases where the sentence boundary is ambiguous and Pragmatic Segmenter does not have a built-in rule
83
83
 
@@ -862,6 +862,9 @@ To test the relative performance of different segmentation tools and libraries I
862
862
  **Version 0.3.17**
863
863
  * Fix issue involving the HTML regex in the cleaner
864
864
 
865
+ **Version 0.3.18**
866
+ * Performance optimizations
867
+
865
868
  ## Contributing
866
869
 
867
870
  If you find a text that is incorrectly segmented using this gem, please submit an issue.
@@ -1,4 +1,6 @@
1
1
  # -*- encoding : utf-8 -*-
2
+ # frozen_string_literal: true
3
+
2
4
  require 'unicode'
3
5
 
4
6
  module PragmaticSegmenter
@@ -28,11 +30,12 @@ module PragmaticSegmenter
28
30
  def search_for_abbreviations_in_string(txt)
29
31
  original = txt.dup
30
32
  downcased = Unicode::downcase(txt)
31
- @language::Abbreviation::ABBREVIATIONS.each do |a|
32
- next unless downcased.include?(a.strip)
33
- abbrev_match = original.scan(/(?:^|\s|\r|\n)#{Regexp.escape(a.strip)}/i)
33
+ @language::Abbreviation::ABBREVIATIONS.each do |abbreviation|
34
+ stripped = abbreviation.strip
35
+ next unless downcased.include?(stripped)
36
+ abbrev_match = original.scan(/(?:^|\s|\r|\n)#{Regexp.escape(stripped)}/i)
34
37
  next if abbrev_match.empty?
35
- next_word_start = /(?<=#{Regexp.escape(a.strip)} ).{1}/
38
+ next_word_start = /(?<=#{Regexp.escape(stripped)} ).{1}/
36
39
  character_array = @text.scan(next_word_start)
37
40
  abbrev_match.each_with_index do |am, index|
38
41
  txt = scan_for_replacements(txt, am, index, character_array)
@@ -74,19 +77,11 @@ module PragmaticSegmenter
74
77
  # and try to cover the words that most often start a
75
78
  # sentence but could never follow one of the abbreviations below.
76
79
 
80
+ # Rubular: http://rubular.com/r/PkBQ3PVBS8
77
81
  @language::AbbreviationReplacer::SENTENCE_STARTERS.each do |word|
78
82
  escaped = Regexp.escape(word)
79
- txt.gsub!(/U∯S∯\s#{escaped}\s/, "U∯S\.\s#{escaped}\s")
80
- txt.gsub!(/U\.S∯\s#{escaped}\s/, "U\.S\.\s#{escaped}\s")
81
- txt.gsub!(/U∯K∯\s#{escaped}\s/, "U∯K\.\s#{escaped}\s")
82
- txt.gsub!(/U\.K∯\s#{escaped}\s/, "U\.K\.\s#{escaped}\s")
83
- txt.gsub!(/E∯U∯\s#{escaped}\s/, "E∯U\.\s#{escaped}\s")
84
- txt.gsub!(/E\.U∯\s#{escaped}\s/, "E\.U\.\s#{escaped}\s")
85
- txt.gsub!(/U∯S∯A∯\s#{escaped}\s/, "U∯S∯A\.\s#{escaped}\s")
86
- txt.gsub!(/U\.S\.A∯\s#{escaped}\s/, "U\.S\.A\.\s#{escaped}\s")
87
- txt.gsub!(/I∯\s#{escaped}\s/, "I\.\s#{escaped}\s")
88
- txt.gsub!(/i.v∯\s#{escaped}\s/, "i\.v\.\s#{escaped}\s")
89
- txt.gsub!(/I.V∯\s#{escaped}\s/, "I\.V\.\s#{escaped}\s")
83
+ regex = /(U∯S|U\.S|UK|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯(?=\s#{escaped}\s)/
84
+ txt.gsub!(regex, '\1.')
90
85
  end
91
86
  txt
92
87
  end
@@ -1,4 +1,5 @@
1
1
  # -*- encoding : utf-8 -*-
2
+ # frozen_string_literal: true
2
3
 
3
4
  module PragmaticSegmenter
4
5
  # This class searches for punctuation between quotes or parenthesis
@@ -1,4 +1,6 @@
1
1
  # -*- encoding : utf-8 -*-
2
+ # frozen_string_literal: true
3
+
2
4
  require_relative 'cleaner/rules'
3
5
 
4
6
  module PragmaticSegmenter
@@ -62,7 +64,7 @@ module PragmaticSegmenter
62
64
 
63
65
  def replace_punctuation_in_brackets
64
66
  @text.dup.gsub!(/\[(?:[^\]])*\]/) do |match|
65
- @text.gsub!(/#{Regexp.escape(match)}/, "#{match.dup.gsub!(/\?/, '&ᓷ&')}") if match.include?('?')
67
+ @text.gsub!(/#{Regexp.escape(match)}/, match.dup.gsub!(/\?/, '&ᓷ&')) if match.include?('?')
66
68
  end
67
69
  end
68
70
 
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module PragmaticSegmenter
2
4
  # This is an opinionated class that removes errant newlines,
3
5
  # xhtml, inline formatting, etc.
@@ -1,19 +1,20 @@
1
1
  # -*- encoding : utf-8 -*-
2
+ # frozen_string_literal: true
3
+
2
4
  require 'pragmatic_segmenter/punctuation_replacer'
3
5
 
4
6
  module PragmaticSegmenter
5
7
  # This class searches for exclamation points that
6
8
  # are part of words and not ending punctuation and replaces them.
7
9
  module ExclamationWords
8
- WORDS_WITH_EXCLAMATIONS = ['!Xũ', '!Kung', 'ǃʼOǃKung', '!Xuun', '!Kung-Ekoka', 'ǃHu', 'ǃKhung', 'ǃKu', 'ǃung', 'ǃXo', 'ǃXû', 'ǃXung', 'ǃXũ', '!Xun', 'Yahoo!', 'Y!J', 'Yum!']
10
+ EXCLAMATION_WORDS = %w[!Xũ !Kung ǃʼOǃKung !Xuun !Kung-Ekoka ǃHu ǃKhung ǃKu ǃung ǃXo ǃXû ǃXung ǃXũ !Xun Yahoo! Y!J Yum!].freeze
11
+ REGEXP = Regexp.new(EXCLAMATION_WORDS.map { |string| Regexp.escape(string) }.join('|'))
9
12
 
10
13
  def self.apply_rules(text)
11
- WORDS_WITH_EXCLAMATIONS.each do |exclamation|
12
- PragmaticSegmenter::PunctuationReplacer.new(
13
- matches_array: text.scan(/#{Regexp.escape(exclamation)}/),
14
- text: text
15
- ).replace
16
- end
14
+ PragmaticSegmenter::PunctuationReplacer.new(
15
+ matches_array: text.scan(REGEXP),
16
+ text: text
17
+ ).replace
17
18
  end
18
19
  end
19
20
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'pragmatic_segmenter/types'
2
4
  require 'pragmatic_segmenter/processor'
3
5
  require 'pragmatic_segmenter/cleaner'
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module PragmaticSegmenter
2
4
  module Languages
3
5
  module Amharic
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module PragmaticSegmenter
2
4
  module Languages
3
5
  module Arabic
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module PragmaticSegmenter
2
4
  module Languages
3
5
  module Armenian
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module PragmaticSegmenter
2
4
  module Languages
3
5
  module Bulgarian
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module PragmaticSegmenter
2
4
  module Languages
3
5
  module Burmese
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module PragmaticSegmenter
2
4
  module Languages
3
5
  module Chinese
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require_relative 'common/numbers'
2
4
  require_relative 'common/ellipsis'
3
5
 
@@ -1,4 +1,5 @@
1
1
  # -*- encoding : utf-8 -*-
2
+ # frozen_string_literal: true
2
3
 
3
4
  module PragmaticSegmenter
4
5
  module Languages
@@ -1,4 +1,5 @@
1
1
  # -*- encoding : utf-8 -*-
2
+ # frozen_string_literal: true
2
3
 
3
4
  module PragmaticSegmenter
4
5
  module Languages
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module PragmaticSegmenter
2
4
  module Languages
3
5
  module Danish
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module PragmaticSegmenter
2
4
  module Languages
3
5
  module Deutsch
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module PragmaticSegmenter
2
4
  module Languages
3
5
  module Dutch
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module PragmaticSegmenter
2
4
  module Languages
3
5
  module English
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module PragmaticSegmenter
2
4
  module Languages
3
5
  module French
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module PragmaticSegmenter
2
4
  module Languages
3
5
  module Greek
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module PragmaticSegmenter
2
4
  module Languages
3
5
  module Hindi
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module PragmaticSegmenter
2
4
  module Languages
3
5
  module Italian
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module PragmaticSegmenter
2
4
  module Languages
3
5
  module Japanese
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module PragmaticSegmenter
2
4
  module Languages
3
5
  module Persian
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module PragmaticSegmenter
2
4
  module Languages
3
5
  module Polish
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module PragmaticSegmenter
2
4
  module Languages
3
5
  module Russian
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module PragmaticSegmenter
2
4
  module Languages
3
5
  module Spanish
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module PragmaticSegmenter
2
4
  module Languages
3
5
  module Urdu
@@ -1,4 +1,5 @@
1
1
  # -*- encoding : utf-8 -*-
2
+ # frozen_string_literal: true
2
3
 
3
4
  module PragmaticSegmenter
4
5
  # This class searches for a list within a string and adds
@@ -41,6 +42,10 @@ module PragmaticSegmenter
41
42
  ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX =
42
43
  /(?<=^)[a-z]\.|(?<=\A)[a-z]\.|(?<=\s)[a-z]\./i
43
44
 
45
+ # Rubular: http://rubular.com/r/GcnmQt4a3I
46
+ ROMAN_NUMERALS_IN_PARENTHESES =
47
+ /\(((?=[mdclxvi])m*(c[md]|d?c*)(x[cl]|l?x*)(i[xv]|v?i*))\)(?=\s[A-Z])/
48
+
44
49
  attr_reader :text
45
50
  def initialize(text:)
46
51
  @text = Text.new(text)
@@ -54,12 +59,7 @@ module PragmaticSegmenter
54
59
  end
55
60
 
56
61
  def replace_parens
57
- ROMAN_NUMERALS.each do |rm|
58
- next unless text =~ /\(#{Regexp.escape(rm)}\)\s[A-Z]/
59
- text.gsub!(/\(#{Regexp.escape(rm)}\)(?=\s[A-Z])/) do |match|
60
- match.gsub!(/\(/, '&✂&').gsub!(/\)/, '&⌬&')
61
- end
62
- end
62
+ text.gsub!(ROMAN_NUMERALS_IN_PARENTHESES, '&✂&\1&⌬&'.freeze)
63
63
  text
64
64
  end
65
65
 
@@ -1,4 +1,6 @@
1
1
  # -*- encoding : utf-8 -*-
2
+ # frozen_string_literal: true
3
+
2
4
  require 'pragmatic_segmenter/punctuation_replacer'
3
5
  require 'pragmatic_segmenter/between_punctuation'
4
6
 
@@ -1,4 +1,5 @@
1
1
  # -*- encoding : utf-8 -*-
2
+ # frozen_string_literal: true
2
3
 
3
4
  module PragmaticSegmenter
4
5
  # This class replaces punctuation that is typically a sentence boundary
@@ -63,7 +64,7 @@ module PragmaticSegmenter
63
64
 
64
65
  def sub_characters(string, char_a, char_b)
65
66
  sub = string.gsub(char_a, char_b)
66
- @text.gsub!(/#{Regexp.escape(string)}/, "#{sub}")
67
+ @text.gsub!(/#{Regexp.escape(string)}/, sub)
67
68
  sub
68
69
  end
69
70
  end
@@ -1,4 +1,6 @@
1
1
  # -*- encoding : utf-8 -*-
2
+ # frozen_string_literal: true
3
+
2
4
  require 'pragmatic_segmenter/languages'
3
5
 
4
6
  module PragmaticSegmenter
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module PragmaticSegmenter
2
4
  Rule = Struct.new(:pattern, :replacement)
3
5
 
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module PragmaticSegmenter
2
- VERSION = "0.3.17"
4
+ VERSION = "0.3.18"
3
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.17
4
+ version: 0.3.18
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-12-07 00:00:00.000000000 Z
11
+ date: 2018-03-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: unicode
@@ -180,7 +180,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
180
180
  version: '0'
181
181
  requirements: []
182
182
  rubyforge_project:
183
- rubygems_version: 2.6.14
183
+ rubygems_version: 2.4.1
184
184
  signing_key:
185
185
  specification_version: 4
186
186
  summary: A rule-based sentence boundary detection gem that works out-of-the-box across