pragmatic_segmenter 0.3.17 → 0.3.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/NEWS +4 -0
- data/README.md +4 -1
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +10 -15
- data/lib/pragmatic_segmenter/between_punctuation.rb +1 -0
- data/lib/pragmatic_segmenter/cleaner.rb +3 -1
- data/lib/pragmatic_segmenter/cleaner/rules.rb +2 -0
- data/lib/pragmatic_segmenter/exclamation_words.rb +8 -7
- data/lib/pragmatic_segmenter/languages.rb +2 -0
- data/lib/pragmatic_segmenter/languages/amharic.rb +2 -0
- data/lib/pragmatic_segmenter/languages/arabic.rb +2 -0
- data/lib/pragmatic_segmenter/languages/armenian.rb +2 -0
- data/lib/pragmatic_segmenter/languages/bulgarian.rb +2 -0
- data/lib/pragmatic_segmenter/languages/burmese.rb +2 -0
- data/lib/pragmatic_segmenter/languages/chinese.rb +2 -0
- data/lib/pragmatic_segmenter/languages/common.rb +2 -0
- data/lib/pragmatic_segmenter/languages/common/ellipsis.rb +1 -0
- data/lib/pragmatic_segmenter/languages/common/numbers.rb +1 -0
- data/lib/pragmatic_segmenter/languages/danish.rb +2 -0
- data/lib/pragmatic_segmenter/languages/deutsch.rb +2 -0
- data/lib/pragmatic_segmenter/languages/dutch.rb +2 -0
- data/lib/pragmatic_segmenter/languages/english.rb +2 -0
- data/lib/pragmatic_segmenter/languages/french.rb +2 -0
- data/lib/pragmatic_segmenter/languages/greek.rb +2 -0
- data/lib/pragmatic_segmenter/languages/hindi.rb +2 -0
- data/lib/pragmatic_segmenter/languages/italian.rb +2 -0
- data/lib/pragmatic_segmenter/languages/japanese.rb +2 -0
- data/lib/pragmatic_segmenter/languages/persian.rb +2 -0
- data/lib/pragmatic_segmenter/languages/polish.rb +2 -0
- data/lib/pragmatic_segmenter/languages/russian.rb +2 -0
- data/lib/pragmatic_segmenter/languages/spanish.rb +2 -0
- data/lib/pragmatic_segmenter/languages/urdu.rb +2 -0
- data/lib/pragmatic_segmenter/list.rb +6 -6
- data/lib/pragmatic_segmenter/processor.rb +2 -0
- data/lib/pragmatic_segmenter/punctuation_replacer.rb +2 -1
- data/lib/pragmatic_segmenter/segmenter.rb +2 -0
- data/lib/pragmatic_segmenter/types.rb +2 -0
- data/lib/pragmatic_segmenter/version.rb +3 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3bb581e56e988521adc41dbb94bc7281ee7dfa95
|
4
|
+
data.tar.gz: 0c3b6fe877a5d39d36053b7ed68a860b5d17779b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1b1dd64b5a382e8bb7ed5d79fbb9264565d71088f234ba4a9bd7cae47184e1c64f78b32a72f39326aa31936c6c6742aa2ff75cd75cd1b328987a6061a4d2534b
|
7
|
+
data.tar.gz: c150b178c93b7183300559e89c117cf8f9f93adf6ef33790a3ce0b292c66588fe6461c724210f7f93e078d2d08a10e995d7234bad963f8d5aa1c52c378effe5e
|
data/NEWS
CHANGED
data/README.md
CHANGED
@@ -77,7 +77,7 @@ Pragmatic Segmenter is opinionated and made for the explicit purpose of segmenti
|
|
77
77
|
Pragmatic Segmenter is specifically used for the purpose of segmenting texts for use in translation (and translation memory) related applications. Therefore Pragmatic Segmenter takes a stance on some formatting and segmentation gray areas with the goal of improving the segmentation for the above stated purpose. Some examples:
|
78
78
|
|
79
79
|
- Removes 'table of contents' style long string of periods ('............')
|
80
|
-
- Keeps
|
80
|
+
- Keeps parentheticals, quotations, and parentheticals or quotations within a sentence as one segment for clarity even though technically there may be multiple grammatical sentences within the segment
|
81
81
|
- Strips out any xhtml code
|
82
82
|
- Conservative in cases where the sentence boundary is ambiguous and Pragmatic Segmenter does not have a built-in rule
|
83
83
|
|
@@ -862,6 +862,9 @@ To test the relative performance of different segmentation tools and libraries I
|
|
862
862
|
**Version 0.3.17**
|
863
863
|
* Fix issue involving the HTML regex in the cleaner
|
864
864
|
|
865
|
+
**Version 0.3.18**
|
866
|
+
* Performance optimizations
|
867
|
+
|
865
868
|
## Contributing
|
866
869
|
|
867
870
|
If you find a text that is incorrectly segmented using this gem, please submit an issue.
|
@@ -1,4 +1,6 @@
|
|
1
1
|
# -*- encoding : utf-8 -*-
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
2
4
|
require 'unicode'
|
3
5
|
|
4
6
|
module PragmaticSegmenter
|
@@ -28,11 +30,12 @@ module PragmaticSegmenter
|
|
28
30
|
def search_for_abbreviations_in_string(txt)
|
29
31
|
original = txt.dup
|
30
32
|
downcased = Unicode::downcase(txt)
|
31
|
-
@language::Abbreviation::ABBREVIATIONS.each do |
|
32
|
-
|
33
|
-
|
33
|
+
@language::Abbreviation::ABBREVIATIONS.each do |abbreviation|
|
34
|
+
stripped = abbreviation.strip
|
35
|
+
next unless downcased.include?(stripped)
|
36
|
+
abbrev_match = original.scan(/(?:^|\s|\r|\n)#{Regexp.escape(stripped)}/i)
|
34
37
|
next if abbrev_match.empty?
|
35
|
-
next_word_start = /(?<=#{Regexp.escape(
|
38
|
+
next_word_start = /(?<=#{Regexp.escape(stripped)} ).{1}/
|
36
39
|
character_array = @text.scan(next_word_start)
|
37
40
|
abbrev_match.each_with_index do |am, index|
|
38
41
|
txt = scan_for_replacements(txt, am, index, character_array)
|
@@ -74,19 +77,11 @@ module PragmaticSegmenter
|
|
74
77
|
# and try to cover the words that most often start a
|
75
78
|
# sentence but could never follow one of the abbreviations below.
|
76
79
|
|
80
|
+
# Rubular: http://rubular.com/r/PkBQ3PVBS8
|
77
81
|
@language::AbbreviationReplacer::SENTENCE_STARTERS.each do |word|
|
78
82
|
escaped = Regexp.escape(word)
|
79
|
-
|
80
|
-
txt.gsub!(
|
81
|
-
txt.gsub!(/U∯K∯\s#{escaped}\s/, "U∯K\.\s#{escaped}\s")
|
82
|
-
txt.gsub!(/U\.K∯\s#{escaped}\s/, "U\.K\.\s#{escaped}\s")
|
83
|
-
txt.gsub!(/E∯U∯\s#{escaped}\s/, "E∯U\.\s#{escaped}\s")
|
84
|
-
txt.gsub!(/E\.U∯\s#{escaped}\s/, "E\.U\.\s#{escaped}\s")
|
85
|
-
txt.gsub!(/U∯S∯A∯\s#{escaped}\s/, "U∯S∯A\.\s#{escaped}\s")
|
86
|
-
txt.gsub!(/U\.S\.A∯\s#{escaped}\s/, "U\.S\.A\.\s#{escaped}\s")
|
87
|
-
txt.gsub!(/I∯\s#{escaped}\s/, "I\.\s#{escaped}\s")
|
88
|
-
txt.gsub!(/i.v∯\s#{escaped}\s/, "i\.v\.\s#{escaped}\s")
|
89
|
-
txt.gsub!(/I.V∯\s#{escaped}\s/, "I\.V\.\s#{escaped}\s")
|
83
|
+
regex = /(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯(?=\s#{escaped}\s)/
|
84
|
+
txt.gsub!(regex, '\1.')
|
90
85
|
end
|
91
86
|
txt
|
92
87
|
end
|
@@ -1,4 +1,6 @@
|
|
1
1
|
# -*- encoding : utf-8 -*-
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
2
4
|
require_relative 'cleaner/rules'
|
3
5
|
|
4
6
|
module PragmaticSegmenter
|
@@ -62,7 +64,7 @@ module PragmaticSegmenter
|
|
62
64
|
|
63
65
|
def replace_punctuation_in_brackets
|
64
66
|
@text.dup.gsub!(/\[(?:[^\]])*\]/) do |match|
|
65
|
-
@text.gsub!(/#{Regexp.escape(match)}/,
|
67
|
+
@text.gsub!(/#{Regexp.escape(match)}/, match.dup.gsub!(/\?/, '&ᓷ&')) if match.include?('?')
|
66
68
|
end
|
67
69
|
end
|
68
70
|
|
@@ -1,19 +1,20 @@
|
|
1
1
|
# -*- encoding : utf-8 -*-
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
2
4
|
require 'pragmatic_segmenter/punctuation_replacer'
|
3
5
|
|
4
6
|
module PragmaticSegmenter
|
5
7
|
# This class searches for exclamation points that
|
6
8
|
# are part of words and not ending punctuation and replaces them.
|
7
9
|
module ExclamationWords
|
8
|
-
|
10
|
+
EXCLAMATION_WORDS = %w[!Xũ !Kung ǃʼOǃKung !Xuun !Kung-Ekoka ǃHu ǃKhung ǃKu ǃung ǃXo ǃXû ǃXung ǃXũ !Xun Yahoo! Y!J Yum!].freeze
|
11
|
+
REGEXP = Regexp.new(EXCLAMATION_WORDS.map { |string| Regexp.escape(string) }.join('|'))
|
9
12
|
|
10
13
|
def self.apply_rules(text)
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
).replace
|
16
|
-
end
|
14
|
+
PragmaticSegmenter::PunctuationReplacer.new(
|
15
|
+
matches_array: text.scan(REGEXP),
|
16
|
+
text: text
|
17
|
+
).replace
|
17
18
|
end
|
18
19
|
end
|
19
20
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# -*- encoding : utf-8 -*-
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
module PragmaticSegmenter
|
4
5
|
# This class searches for a list within a string and adds
|
@@ -41,6 +42,10 @@ module PragmaticSegmenter
|
|
41
42
|
ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX =
|
42
43
|
/(?<=^)[a-z]\.|(?<=\A)[a-z]\.|(?<=\s)[a-z]\./i
|
43
44
|
|
45
|
+
# Rubular: http://rubular.com/r/GcnmQt4a3I
|
46
|
+
ROMAN_NUMERALS_IN_PARENTHESES =
|
47
|
+
/\(((?=[mdclxvi])m*(c[md]|d?c*)(x[cl]|l?x*)(i[xv]|v?i*))\)(?=\s[A-Z])/
|
48
|
+
|
44
49
|
attr_reader :text
|
45
50
|
def initialize(text:)
|
46
51
|
@text = Text.new(text)
|
@@ -54,12 +59,7 @@ module PragmaticSegmenter
|
|
54
59
|
end
|
55
60
|
|
56
61
|
def replace_parens
|
57
|
-
|
58
|
-
next unless text =~ /\(#{Regexp.escape(rm)}\)\s[A-Z]/
|
59
|
-
text.gsub!(/\(#{Regexp.escape(rm)}\)(?=\s[A-Z])/) do |match|
|
60
|
-
match.gsub!(/\(/, '&✂&').gsub!(/\)/, '&⌬&')
|
61
|
-
end
|
62
|
-
end
|
62
|
+
text.gsub!(ROMAN_NUMERALS_IN_PARENTHESES, '&✂&\1&⌬&'.freeze)
|
63
63
|
text
|
64
64
|
end
|
65
65
|
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# -*- encoding : utf-8 -*-
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
module PragmaticSegmenter
|
4
5
|
# This class replaces punctuation that is typically a sentence boundary
|
@@ -63,7 +64,7 @@ module PragmaticSegmenter
|
|
63
64
|
|
64
65
|
def sub_characters(string, char_a, char_b)
|
65
66
|
sub = string.gsub(char_a, char_b)
|
66
|
-
@text.gsub!(/#{Regexp.escape(string)}/,
|
67
|
+
@text.gsub!(/#{Regexp.escape(string)}/, sub)
|
67
68
|
sub
|
68
69
|
end
|
69
70
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.17
|
4
|
+
version: 0.3.18
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-03-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: unicode
|
@@ -180,7 +180,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
180
180
|
version: '0'
|
181
181
|
requirements: []
|
182
182
|
rubyforge_project:
|
183
|
-
rubygems_version: 2.
|
183
|
+
rubygems_version: 2.4.1
|
184
184
|
signing_key:
|
185
185
|
specification_version: 4
|
186
186
|
summary: A rule-based sentence boundary detection gem that works out-of-the-box across
|