pragmatic_segmenter 0.3.17 → 0.3.18
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/NEWS +4 -0
- data/README.md +4 -1
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +10 -15
- data/lib/pragmatic_segmenter/between_punctuation.rb +1 -0
- data/lib/pragmatic_segmenter/cleaner.rb +3 -1
- data/lib/pragmatic_segmenter/cleaner/rules.rb +2 -0
- data/lib/pragmatic_segmenter/exclamation_words.rb +8 -7
- data/lib/pragmatic_segmenter/languages.rb +2 -0
- data/lib/pragmatic_segmenter/languages/amharic.rb +2 -0
- data/lib/pragmatic_segmenter/languages/arabic.rb +2 -0
- data/lib/pragmatic_segmenter/languages/armenian.rb +2 -0
- data/lib/pragmatic_segmenter/languages/bulgarian.rb +2 -0
- data/lib/pragmatic_segmenter/languages/burmese.rb +2 -0
- data/lib/pragmatic_segmenter/languages/chinese.rb +2 -0
- data/lib/pragmatic_segmenter/languages/common.rb +2 -0
- data/lib/pragmatic_segmenter/languages/common/ellipsis.rb +1 -0
- data/lib/pragmatic_segmenter/languages/common/numbers.rb +1 -0
- data/lib/pragmatic_segmenter/languages/danish.rb +2 -0
- data/lib/pragmatic_segmenter/languages/deutsch.rb +2 -0
- data/lib/pragmatic_segmenter/languages/dutch.rb +2 -0
- data/lib/pragmatic_segmenter/languages/english.rb +2 -0
- data/lib/pragmatic_segmenter/languages/french.rb +2 -0
- data/lib/pragmatic_segmenter/languages/greek.rb +2 -0
- data/lib/pragmatic_segmenter/languages/hindi.rb +2 -0
- data/lib/pragmatic_segmenter/languages/italian.rb +2 -0
- data/lib/pragmatic_segmenter/languages/japanese.rb +2 -0
- data/lib/pragmatic_segmenter/languages/persian.rb +2 -0
- data/lib/pragmatic_segmenter/languages/polish.rb +2 -0
- data/lib/pragmatic_segmenter/languages/russian.rb +2 -0
- data/lib/pragmatic_segmenter/languages/spanish.rb +2 -0
- data/lib/pragmatic_segmenter/languages/urdu.rb +2 -0
- data/lib/pragmatic_segmenter/list.rb +6 -6
- data/lib/pragmatic_segmenter/processor.rb +2 -0
- data/lib/pragmatic_segmenter/punctuation_replacer.rb +2 -1
- data/lib/pragmatic_segmenter/segmenter.rb +2 -0
- data/lib/pragmatic_segmenter/types.rb +2 -0
- data/lib/pragmatic_segmenter/version.rb +3 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3bb581e56e988521adc41dbb94bc7281ee7dfa95
|
4
|
+
data.tar.gz: 0c3b6fe877a5d39d36053b7ed68a860b5d17779b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1b1dd64b5a382e8bb7ed5d79fbb9264565d71088f234ba4a9bd7cae47184e1c64f78b32a72f39326aa31936c6c6742aa2ff75cd75cd1b328987a6061a4d2534b
|
7
|
+
data.tar.gz: c150b178c93b7183300559e89c117cf8f9f93adf6ef33790a3ce0b292c66588fe6461c724210f7f93e078d2d08a10e995d7234bad963f8d5aa1c52c378effe5e
|
data/NEWS
CHANGED
data/README.md
CHANGED
@@ -77,7 +77,7 @@ Pragmatic Segmenter is opinionated and made for the explicit purpose of segmenti
|
|
77
77
|
Pragmatic Segmenter is specifically used for the purpose of segmenting texts for use in translation (and translation memory) related applications. Therefore Pragmatic Segmenter takes a stance on some formatting and segmentation gray areas with the goal of improving the segmentation for the above stated purpose. Some examples:
|
78
78
|
|
79
79
|
- Removes 'table of contents' style long string of periods ('............')
|
80
|
-
- Keeps
|
80
|
+
- Keeps parentheticals, quotations, and parentheticals or quotations within a sentence as one segment for clarity even though technically there may be multiple grammatical sentences within the segment
|
81
81
|
- Strips out any xhtml code
|
82
82
|
- Conservative in cases where the sentence boundary is ambiguous and Pragmatic Segmenter does not have a built-in rule
|
83
83
|
|
@@ -862,6 +862,9 @@ To test the relative performance of different segmentation tools and libraries I
|
|
862
862
|
**Version 0.3.17**
|
863
863
|
* Fix issue involving the HTML regex in the cleaner
|
864
864
|
|
865
|
+
**Version 0.3.18**
|
866
|
+
* Performance optimizations
|
867
|
+
|
865
868
|
## Contributing
|
866
869
|
|
867
870
|
If you find a text that is incorrectly segmented using this gem, please submit an issue.
|
@@ -1,4 +1,6 @@
|
|
1
1
|
# -*- encoding : utf-8 -*-
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
2
4
|
require 'unicode'
|
3
5
|
|
4
6
|
module PragmaticSegmenter
|
@@ -28,11 +30,12 @@ module PragmaticSegmenter
|
|
28
30
|
def search_for_abbreviations_in_string(txt)
|
29
31
|
original = txt.dup
|
30
32
|
downcased = Unicode::downcase(txt)
|
31
|
-
@language::Abbreviation::ABBREVIATIONS.each do |
|
32
|
-
|
33
|
-
|
33
|
+
@language::Abbreviation::ABBREVIATIONS.each do |abbreviation|
|
34
|
+
stripped = abbreviation.strip
|
35
|
+
next unless downcased.include?(stripped)
|
36
|
+
abbrev_match = original.scan(/(?:^|\s|\r|\n)#{Regexp.escape(stripped)}/i)
|
34
37
|
next if abbrev_match.empty?
|
35
|
-
next_word_start = /(?<=#{Regexp.escape(
|
38
|
+
next_word_start = /(?<=#{Regexp.escape(stripped)} ).{1}/
|
36
39
|
character_array = @text.scan(next_word_start)
|
37
40
|
abbrev_match.each_with_index do |am, index|
|
38
41
|
txt = scan_for_replacements(txt, am, index, character_array)
|
@@ -74,19 +77,11 @@ module PragmaticSegmenter
|
|
74
77
|
# and try to cover the words that most often start a
|
75
78
|
# sentence but could never follow one of the abbreviations below.
|
76
79
|
|
80
|
+
# Rubular: http://rubular.com/r/PkBQ3PVBS8
|
77
81
|
@language::AbbreviationReplacer::SENTENCE_STARTERS.each do |word|
|
78
82
|
escaped = Regexp.escape(word)
|
79
|
-
|
80
|
-
txt.gsub!(
|
81
|
-
txt.gsub!(/U∯K∯\s#{escaped}\s/, "U∯K\.\s#{escaped}\s")
|
82
|
-
txt.gsub!(/U\.K∯\s#{escaped}\s/, "U\.K\.\s#{escaped}\s")
|
83
|
-
txt.gsub!(/E∯U∯\s#{escaped}\s/, "E∯U\.\s#{escaped}\s")
|
84
|
-
txt.gsub!(/E\.U∯\s#{escaped}\s/, "E\.U\.\s#{escaped}\s")
|
85
|
-
txt.gsub!(/U∯S∯A∯\s#{escaped}\s/, "U∯S∯A\.\s#{escaped}\s")
|
86
|
-
txt.gsub!(/U\.S\.A∯\s#{escaped}\s/, "U\.S\.A\.\s#{escaped}\s")
|
87
|
-
txt.gsub!(/I∯\s#{escaped}\s/, "I\.\s#{escaped}\s")
|
88
|
-
txt.gsub!(/i.v∯\s#{escaped}\s/, "i\.v\.\s#{escaped}\s")
|
89
|
-
txt.gsub!(/I.V∯\s#{escaped}\s/, "I\.V\.\s#{escaped}\s")
|
83
|
+
regex = /(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯(?=\s#{escaped}\s)/
|
84
|
+
txt.gsub!(regex, '\1.')
|
90
85
|
end
|
91
86
|
txt
|
92
87
|
end
|
@@ -1,4 +1,6 @@
|
|
1
1
|
# -*- encoding : utf-8 -*-
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
2
4
|
require_relative 'cleaner/rules'
|
3
5
|
|
4
6
|
module PragmaticSegmenter
|
@@ -62,7 +64,7 @@ module PragmaticSegmenter
|
|
62
64
|
|
63
65
|
def replace_punctuation_in_brackets
|
64
66
|
@text.dup.gsub!(/\[(?:[^\]])*\]/) do |match|
|
65
|
-
@text.gsub!(/#{Regexp.escape(match)}/,
|
67
|
+
@text.gsub!(/#{Regexp.escape(match)}/, match.dup.gsub!(/\?/, '&ᓷ&')) if match.include?('?')
|
66
68
|
end
|
67
69
|
end
|
68
70
|
|
@@ -1,19 +1,20 @@
|
|
1
1
|
# -*- encoding : utf-8 -*-
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
2
4
|
require 'pragmatic_segmenter/punctuation_replacer'
|
3
5
|
|
4
6
|
module PragmaticSegmenter
|
5
7
|
# This class searches for exclamation points that
|
6
8
|
# are part of words and not ending punctuation and replaces them.
|
7
9
|
module ExclamationWords
|
8
|
-
|
10
|
+
EXCLAMATION_WORDS = %w[!Xũ !Kung ǃʼOǃKung !Xuun !Kung-Ekoka ǃHu ǃKhung ǃKu ǃung ǃXo ǃXû ǃXung ǃXũ !Xun Yahoo! Y!J Yum!].freeze
|
11
|
+
REGEXP = Regexp.new(EXCLAMATION_WORDS.map { |string| Regexp.escape(string) }.join('|'))
|
9
12
|
|
10
13
|
def self.apply_rules(text)
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
).replace
|
16
|
-
end
|
14
|
+
PragmaticSegmenter::PunctuationReplacer.new(
|
15
|
+
matches_array: text.scan(REGEXP),
|
16
|
+
text: text
|
17
|
+
).replace
|
17
18
|
end
|
18
19
|
end
|
19
20
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# -*- encoding : utf-8 -*-
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
module PragmaticSegmenter
|
4
5
|
# This class searches for a list within a string and adds
|
@@ -41,6 +42,10 @@ module PragmaticSegmenter
|
|
41
42
|
ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX =
|
42
43
|
/(?<=^)[a-z]\.|(?<=\A)[a-z]\.|(?<=\s)[a-z]\./i
|
43
44
|
|
45
|
+
# Rubular: http://rubular.com/r/GcnmQt4a3I
|
46
|
+
ROMAN_NUMERALS_IN_PARENTHESES =
|
47
|
+
/\(((?=[mdclxvi])m*(c[md]|d?c*)(x[cl]|l?x*)(i[xv]|v?i*))\)(?=\s[A-Z])/
|
48
|
+
|
44
49
|
attr_reader :text
|
45
50
|
def initialize(text:)
|
46
51
|
@text = Text.new(text)
|
@@ -54,12 +59,7 @@ module PragmaticSegmenter
|
|
54
59
|
end
|
55
60
|
|
56
61
|
def replace_parens
|
57
|
-
|
58
|
-
next unless text =~ /\(#{Regexp.escape(rm)}\)\s[A-Z]/
|
59
|
-
text.gsub!(/\(#{Regexp.escape(rm)}\)(?=\s[A-Z])/) do |match|
|
60
|
-
match.gsub!(/\(/, '&✂&').gsub!(/\)/, '&⌬&')
|
61
|
-
end
|
62
|
-
end
|
62
|
+
text.gsub!(ROMAN_NUMERALS_IN_PARENTHESES, '&✂&\1&⌬&'.freeze)
|
63
63
|
text
|
64
64
|
end
|
65
65
|
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# -*- encoding : utf-8 -*-
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
module PragmaticSegmenter
|
4
5
|
# This class replaces punctuation that is typically a sentence boundary
|
@@ -63,7 +64,7 @@ module PragmaticSegmenter
|
|
63
64
|
|
64
65
|
def sub_characters(string, char_a, char_b)
|
65
66
|
sub = string.gsub(char_a, char_b)
|
66
|
-
@text.gsub!(/#{Regexp.escape(string)}/,
|
67
|
+
@text.gsub!(/#{Regexp.escape(string)}/, sub)
|
67
68
|
sub
|
68
69
|
end
|
69
70
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.18
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-03-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: unicode
|
@@ -180,7 +180,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
180
180
|
version: '0'
|
181
181
|
requirements: []
|
182
182
|
rubyforge_project:
|
183
|
-
rubygems_version: 2.
|
183
|
+
rubygems_version: 2.4.1
|
184
184
|
signing_key:
|
185
185
|
specification_version: 4
|
186
186
|
summary: A rule-based sentence boundary detection gem that works out-of-the-box across
|