pragmatic_tokenizer 3.0.3 → 3.0.4

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 5c5f17c8bf848baf655f32dcc9620fb00998ae55
-  data.tar.gz: 6f95da8fd31c05d791d554fabb15a8350ad77d84
+  metadata.gz: 00a9402728c3f1aeb845232ec6dfa1abc536bca9
+  data.tar.gz: da25dbf18f4ebf7a780a4b0a862976b20d01eb39
 SHA512:
-  metadata.gz: d27744d4a7f77c4102927ae2dd2df20fc66b6248b820156bc8712bf9676c65357a82dd7d7540138289e8bf0b6d0d60a1cd940bcc82807a30b4be083a33e28d56
-  data.tar.gz: 20360686b10e8be67c410fe8f48f3c0b057780471cd6ea72b763e8cf0a44cad6a3ad4bfc01bdaad5d2cb1ac38162a5e78b7be52771f6932d9dfcb459f1ee0e42
+  metadata.gz: ced5cc4b72bba8a4d277c7453bda8aed9f82a35477a1e047348fa5f198e50647c6748bbff29484d2b002f28106afd8f5ce432c4d49142234dbf8370402f07004
+  data.tar.gz: ef1988c4325ea1bce415685848f5d666b79ddbde26680f6b9712cdbbffd8558573733ece8c57b29c739dcce335565b8f7dbf5c3b68e213585afb0d7644c9c06d
data/README.md CHANGED
@@ -103,7 +103,7 @@ options = {
 
 ##### `filter_languages`
 **default** = `nil`
- - You can pass an array of languages of which you would like to process abbreviations, stop words and contractions. This language can be indepedent of the language of the string you are tokenizing (for example your tex might be German but contain so English stop words that you want to remove). If you supply your own abbreviations, stop words or contractions they will be merged with the abbreviations, stop words and contractions of any languages you add in this option. You can pass an array of symbols or strings (i.e. `[:en, :de]` or `['en', 'de']`)
+ - You can pass an array of languages of which you would like to process abbreviations, stop words and contractions. This language can be indepedent of the language of the string you are tokenizing (for example your text might be German but contain some English stop words that you want to remove). If you supply your own abbreviations, stop words or contractions they will be merged with the abbreviations, stop words and contractions of any languages you add in this option. You can pass an array of symbols or strings (i.e. `[:en, :de]` or `['en', 'de']`)
 
 <hr>
 
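For context, here is a minimal usage sketch of this option (the sample text and option values are illustrative, not taken from the gem's documentation):

```ruby
require 'pragmatic_tokenizer'

# German text that also contains English stop words we want filtered out.
text = "Das Haus ist groß and the garden too."

pt = PragmaticTokenizer::Tokenizer.new(
  language:          'de',        # language of the text itself
  filter_languages:  [:en, :de],  # also load English stop words, abbreviations and contractions
  remove_stop_words: true
)
pt.tokenize(text)
# stop words from both German ("das", "ist") and English ("and", "the", "too") are removed
```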
data/lib/pragmatic_tokenizer/full_stop_separator.rb CHANGED
@@ -5,11 +5,10 @@ module PragmaticTokenizer
   # periods that are part of an abbreviation
   class FullStopSeparator
 
-    REGEXP_ENDS_WITH_DOT = /\A(.+)\.\z/
-    REGEXP_ONLY_LETTERS  = /\A[a-z]\z/i
-    REGEXP_UNKNOWN1      = /[a-z](?:\.[a-z])+\z/i
-    REGEXP_UNKNOWN2      = /\A(.*\w)\.\z/
-    DOT                  = '.'.freeze
+    REGEXP_ENDS_WITH_DOT = /\A(.*\w)\.\z/
+    REGEXP_ONLY_LETTERS  = /\A[a-z]\z/i
+    REGEXP_ABBREVIATION  = /[a-z](?:\.[a-z])+\z/i
+    DOT                  = '.'.freeze
 
     def initialize(tokens:, abbreviations:, downcase:)
       @tokens = tokens
@@ -30,7 +29,7 @@ module PragmaticTokenizer
       @tokens.each_with_index do |token, position|
         if @tokens[position + 1] && token =~ REGEXP_ENDS_WITH_DOT
           match = Regexp.last_match(1)
-          if unknown_method1(match)
+          if abbreviation?(match)
             @cleaned_tokens += [match, DOT]
             next
           end
@@ -39,11 +38,11 @@ module PragmaticTokenizer
       end
     end
 
-    def unknown_method1(token)
-      !abbreviation?(token) && token !~ REGEXP_ONLY_LETTERS && token !~ REGEXP_UNKNOWN1
+    def abbreviation?(token)
+      !defined_abbreviation?(token) && token !~ REGEXP_ONLY_LETTERS && token !~ REGEXP_ABBREVIATION
     end
 
-    def abbreviation?(token)
+    def defined_abbreviation?(token)
       @abbreviations.include?(inverse_case(token))
     end
 
@@ -53,7 +52,7 @@ module PragmaticTokenizer
 
     def replace_last_token
       last_token = @cleaned_tokens[-1]
-      return if abbreviation?(last_token.chomp(DOT)) || last_token !~ REGEXP_UNKNOWN2
+      return if defined_abbreviation?(last_token.chomp(DOT)) || last_token !~ REGEXP_ENDS_WITH_DOT
       @cleaned_tokens[-1] = Regexp.last_match(1)
       @cleaned_tokens << DOT
     end
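The renames above give the previously opaque patterns descriptive names: `REGEXP_ENDS_WITH_DOT` now also requires a word character before the final period (it absorbs the old `REGEXP_UNKNOWN2`), and `REGEXP_ABBREVIATION` recognizes dotted abbreviations. A standalone sketch of what they match (the examples are illustrative):

```ruby
REGEXP_ENDS_WITH_DOT = /\A(.*\w)\.\z/
REGEXP_ABBREVIATION  = /[a-z](?:\.[a-z])+\z/i

"Mrs."[REGEXP_ENDS_WITH_DOT, 1]  # => "Mrs" (captures the body before the trailing period)
"u.s.a" =~ REGEXP_ABBREVIATION   # => 0     (letters joined by inner periods)
"word"  =~ REGEXP_ABBREVIATION   # => nil   (no inner periods)
```

When a token body matches `REGEXP_ABBREVIATION`, is a single letter, or appears in the abbreviation set, `abbreviation?` returns `false` and the trailing period is not split off into its own token at that point.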
data/lib/pragmatic_tokenizer/languages/english.rb CHANGED
@@ -3,7 +3,7 @@ module PragmaticTokenizer
   module English
     include Languages::Common
  ABBREVIATIONS = Set.new(["adj", "adm", "adv", "al", "ala", "alta", "apr", "arc", "ariz", "ark", "art", "assn", "asst", "attys", "aug", "ave", "bart", "bld", "bldg", "blvd", "brig", "bros", "btw", "cal", "calif", "capt", "cl", "cmdr", "co", "col", "colo", "comdr", "con", "conn", "corp", "cpl", "cres", "ct", "d.phil", "dak", "dec", "del", "dept", "det", "dist", "dr", "dr.phil", "dr.philos", "drs", "e.g", "ens", "esp", "esq", "etc", "exp", "expy", "ext", "feb", "fed", "fla", "ft", "fwy", "fy", "ga", "gen", "gov", "hon", "hosp", "hr", "hway", "hwy", "i.e", "i.b.m", "ia", "id", "ida", "ill", "inc", "ind", "ing", "insp", "jan", "jr", "jul", "jun", "kan", "kans", "ken", "ky", "la", "lt", "ltd", "maj", "man", "mar", "mass", "may", "md", "me", "med", "messrs", "mex", "mfg", "mich", "min", "minn", "miss", "mlle", "mm", "mme", "mo", "mont", "mr", "mrs", "ms", "msgr", "mssrs", "mt", "mtn", "neb", "nebr", "nev", "no", "nos", "nov", "nr", "oct", "ok", "okla", "ont", "op", "ord", "ore", "p", "pa", "pd", "pde", "penn", "penna", "pfc", "ph", "ph.d", "pl", "plz", "pp", "prof", "pvt", "que", "rd", "ref", "rep", "reps", "res", "rev", "rt", "sask", "sec", "sen", "sens", "sep", "sept", "sfc", "sgt", "sr", "st", "supt", "surg", "tce", "tenn", "tex", "u.s", "u.s.a", "univ", "usafa", "ut", "v", "va", "ver", "vs", "vt", "wash", "wis", "wisc", "wy", "wyo", "yuk"]).freeze
- STOP_WORDS = Set.new(["&#;f", "'ll", "'ve", "+//", "-/+", "</li>", "</p>", "</td>", "<br", "<br/>", "<br/><br/>", "<li>", "<p>", "<sup></sup>", "<sup></sup></li>", "<td", "<td>", "a", "a's", "able", "about", "above", "abroad", "abst", "accordance", "according", "accordingly", "across", "act", "actually", "added", "adj", "adopted", "affected", "affecting", "affects", "after", "afterwards", "again", "against", "ago", "ah", "ahead", "ain't", "all", "allow", "allows", "almost", "alone", "along", "alongside", "already", "also", "although", "always", "am", "amid", "amidst", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "are", "aren", "aren't", "arent", "arise", "around", "as", "aside", "ask", "asking", "associated", "at", "auth", "available", "away", "awfully", "b", "back", "backward", "backwards", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bill", "biol", "both", "bottom", "brief", "briefly", "but", "by", "c", "c'mon", "c's", "ca", "call", "came", "can", "can't", "cannot", "cant", "caption", "cause", "causes", "certain", "certainly", "changes", "class=", "clearly", "co", "co.", "com", "come", "comes", "computer", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn't", "couldnt", "course", "cry", "currently", "d", "dare", "daren't", "date", "de", "definitely", "describe", "described", "despite", "detail", "did", "didn't", "different", "directly", "do", "does", "doesn't", "doing", "don't", "done", "down", "downwards", "due", "during", "e", "each", "ed", "edu", "effect", "eg", "eight", "eighty", "either", "eleven", "else", "elsewhere", "empty", "end", "ending", "enough", "entirely", "especially", "et", "et-al", "etc", "even", "ever", "evermore", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "f", "fairly", "far", "farther", "few", "fewer", "ff", "fifteen", "fifth", "fify", "fill", "find", "fire", "first", "five", "fix", "followed", "following", "follows", "for", "forever", "former", "formerly", "forth", "forty", "forward", "found", "four", "from", "front", "full", "further", "furthermore", "g", "gave", "get", "gets", "getting", "give", "given", "gives", "giving", "go", "goes", "going", "gone", "got", "gotten", "greetings", "h", "had", "hadn't", "half", "happens", "hardly", "has", "hasn't", "hasnt", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "hed", "hello", "help", "hence", "her", "here", "here's", "hereafter", "hereby", "herein", "heres", "hereupon", "hers", "herself", "hes", "hi", "hid", "him", "himself", "his", "hither", "home", "hopefully", "how", "how's", "howbeit", "however", "http", "https", "hundred", "i", "i'd", "i'll", "i'm", "i've", "id", "ie", "if", "ignored", "im", "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "inc.", "indeed", "index", "indicate", "indicated", "indicates", "information", "ing", "inner", "inside", "insofar", "instead", "interest", "into", "invention", "inward", "is", "isn't", "it", "it'd", "it'll", "it's", "itd", "its", "itself", "j", "just", "k", "keep", "keeps", "kept", "keys", "kg", "km", 
"know", "known", "knows", "l", "largely", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "let's", "lets", "like", "liked", "likely", "likewise", "line", "little", "look", "looking", "looks", "low", "lower", "ltd", "m", "made", "mainly", "make", "makes", "many", "may", "maybe", "mayn't", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "mightn't", "mill", "million", "mine", "minus", "miss", "ml", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "much", "mug", "must", "mustn't", "my", "myself", "n", "na", "name", "namely", "nay", "nd", "near", "nearly", "necessarily", "necessary", "need", "needn't", "needs", "neither", "never", "neverf", "neverless", "nevertheless", "new", "next", "nine", "ninety", "no", "no-one", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "notwithstanding", "novel", "now", "nowhere", "o", "obtain", "obtained", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "omitted", "on", "once", "one", "one's", "ones", "only", "onto", "opposite", "or", "ord", "other", "others", "otherwise", "ought", "oughtn't", "our", "ours", "ours", "ourselves", "out", "outside", "over", "overall", "owing", "own", "p", "page", "pages", "part", "particular", "particularly", "past", "per", "perhaps", "placed", "please", "plus", "poorly", "possible", "possibly", "potentially", "pp", "predominantly", "present", "presumably", "previously", "primarily", "probably", "promptly", "proud", "provided", "provides", "put", "q", "que", "quickly", "quite", "qv", "r", "ran", "rather", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "respectively", "resulted", "resulting", "results", "right", "round", "run", "s", "said", "same", "saw", "say", "saying", "says", "sec", "second", "secondly", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall", "shan't", "she", "she'd", "she'll", "she's", "shed", "shes", "should", "shouldn't", "show", "showed", "shown", "showns", "shows", "side", "significant", "significantly", "similar", "similarly", "since", "sincere", "six", "sixty", "slightly", "so", "some", "somebody", "someday", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specifically", "specified", "specify", "specifying", "state", "states", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "system", "t", "t's", "take", "taken", "taking", "tell", "ten", "tends", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "that's", "that've", "thats", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "there'd", "there'll", "there're", "there's", "there've", "thereafter", "thereby", "thered", "therefore", "therein", "thereof", "therere", "theres", "thereto", "thereupon", "these", "they", "they'd", "they'll", "they're", "they've", "theyd", "theyre", "thick", "thin", "thing", "things", "think", "third", "thirty", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "til", "till", "tip", "to", "together", "too", "took", "top", "toward", "towards", "tried", "tries", "truly", "try", "trying", "ts", "twelve", "twenty", "twice", 
"two", "u", "un", "under", "underneath", "undoing", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "up", "upon", "ups", "upwards", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "uucp", "v", "value", "various", "versus", "very", "via", "viz", "vol", "vols", "vs", "w", "want", "wants", "was", "wasn't", "way", "we", "we'd", "we'll", "we're", "we've", "wed", "welcome", "well", "went", "were", "weren't", "what", "what'll", "what's", "what're", "what've", "whatever", "whats", "when", "when's", "whence", "whenever", "where", "where's", "whereafter", "whereas", "whereby", "wherein", "wheres", "whereupon", "wherever", "whether", "which", "whichever", "while", "whilst", "whim", "whither", "who", "who'd", "who'll", "who's", "whod", "whoever", "whole", "whom", "whomever", "whos", "whose", "why", "why's", "widely", "will", "willing", "wish", "with", "within", "without", "won't", "wonder", "word", "words", "world", "would", "wouldn't", "www", "x", "y", "yes", "yet", "you", "you'd", "you'll", "you're", "you've", "youd", "your", "youre", "yours", "yourself", "yourselves", "z", "zero"]).freeze
+ STOP_WORDS = Set.new(["i.e.", "e.g.", "&#;f", "'ll", "'ve", "+//", "-/+", "</li>", "</p>", "</td>", "<br", "<br/>", "<br/><br/>", "<li>", "<p>", "<sup></sup>", "<sup></sup></li>", "<td", "<td>", "a", "a's", "able", "about", "above", "abroad", "abst", "accordance", "according", "accordingly", "across", "act", "actually", "added", "adj", "adopted", "affected", "affecting", "affects", "after", "afterwards", "again", "against", "ago", "ah", "ahead", "ain't", "all", "allow", "allows", "almost", "alone", "along", "alongside", "already", "also", "although", "always", "am", "amid", "amidst", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "are", "aren", "aren't", "arent", "arise", "around", "as", "aside", "ask", "asking", "associated", "at", "auth", "available", "away", "awfully", "b", "back", "backward", "backwards", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bill", "biol", "both", "bottom", "brief", "briefly", "but", "by", "c", "c'mon", "c's", "ca", "call", "came", "can", "can't", "cannot", "cant", "caption", "cause", "causes", "certain", "certainly", "changes", "class=", "clearly", "co", "co.", "com", "come", "comes", "computer", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn't", "couldnt", "course", "cry", "currently", "d", "dare", "daren't", "date", "de", "definitely", "describe", "described", "despite", "detail", "did", "didn't", "different", "directly", "do", "does", "doesn't", "doing", "don't", "done", "down", "downwards", "due", "during", "e", "each", "ed", "edu", "effect", "eg", "eight", "eighty", "either", "eleven", "else", "elsewhere", "empty", "end", "ending", "enough", "entirely", "especially", "et", "et-al", "etc", "even", "ever", "evermore", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "f", "fairly", "far", "farther", "few", "fewer", "ff", "fifteen", "fifth", "fify", "fill", "find", "fire", "first", "five", "fix", "followed", "following", "follows", "for", "forever", "former", "formerly", "forth", "forty", "forward", "found", "four", "from", "front", "full", "further", "furthermore", "g", "gave", "get", "gets", "getting", "give", "given", "gives", "giving", "go", "goes", "going", "gone", "got", "gotten", "greetings", "h", "had", "hadn't", "half", "happens", "hardly", "has", "hasn't", "hasnt", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "hed", "hello", "help", "hence", "her", "here", "here's", "hereafter", "hereby", "herein", "heres", "hereupon", "hers", "herself", "hes", "hi", "hid", "him", "himself", "his", "hither", "home", "hopefully", "how", "how's", "howbeit", "however", "http", "https", "hundred", "i", "i'd", "i'll", "i'm", "i've", "id", "ie", "if", "ignored", "im", "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "inc.", "indeed", "index", "indicate", "indicated", "indicates", "information", "ing", "inner", "inside", "insofar", "instead", "interest", "into", "invention", "inward", "is", "isn't", "it", "it'd", "it'll", "it's", "itd", "its", "itself", "j", "just", "k", "keep", "keeps", "kept", 
"keys", "kg", "km", "know", "known", "knows", "l", "largely", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "let's", "lets", "like", "liked", "likely", "likewise", "line", "little", "look", "looking", "looks", "low", "lower", "ltd", "m", "made", "mainly", "make", "makes", "many", "may", "maybe", "mayn't", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "mightn't", "mill", "million", "mine", "minus", "miss", "ml", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "much", "mug", "must", "mustn't", "my", "myself", "n", "na", "name", "namely", "nay", "nd", "near", "nearly", "necessarily", "necessary", "need", "needn't", "needs", "neither", "never", "neverf", "neverless", "nevertheless", "new", "next", "nine", "ninety", "no", "no-one", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "notwithstanding", "novel", "now", "nowhere", "o", "obtain", "obtained", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "omitted", "on", "once", "one", "one's", "ones", "only", "onto", "opposite", "or", "ord", "other", "others", "otherwise", "ought", "oughtn't", "our", "ours", "ours", "ourselves", "out", "outside", "over", "overall", "owing", "own", "p", "page", "pages", "part", "particular", "particularly", "past", "per", "perhaps", "placed", "please", "plus", "poorly", "possible", "possibly", "potentially", "pp", "predominantly", "present", "presumably", "previously", "primarily", "probably", "promptly", "proud", "provided", "provides", "put", "q", "que", "quickly", "quite", "qv", "r", "ran", "rather", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "respectively", "resulted", "resulting", "results", "right", "round", "run", "s", "said", "same", "saw", "say", "saying", "says", "sec", "second", "secondly", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall", "shan't", "she", "she'd", "she'll", "she's", "shed", "shes", "should", "shouldn't", "show", "showed", "shown", "showns", "shows", "side", "significant", "significantly", "similar", "similarly", "since", "sincere", "six", "sixty", "slightly", "so", "some", "somebody", "someday", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specifically", "specified", "specify", "specifying", "state", "states", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "system", "t", "t's", "take", "taken", "taking", "tell", "ten", "tends", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "that's", "that've", "thats", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "there'd", "there'll", "there're", "there's", "there've", "thereafter", "thereby", "thered", "therefore", "therein", "thereof", "therere", "theres", "thereto", "thereupon", "these", "they", "they'd", "they'll", "they're", "they've", "theyd", "theyre", "thick", "thin", "thing", "things", "think", "third", "thirty", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "til", "till", "tip", "to", "together", "too", "took", "top", "toward", "towards", "tried", "tries", "truly", "try", "trying", "ts", "twelve", 
"twenty", "twice", "two", "u", "un", "under", "underneath", "undoing", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "up", "upon", "ups", "upwards", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "uucp", "v", "value", "various", "versus", "very", "via", "viz", "vol", "vols", "vs", "w", "want", "wants", "was", "wasn't", "way", "we", "we'd", "we'll", "we're", "we've", "wed", "welcome", "well", "went", "were", "weren't", "what", "what'll", "what's", "what're", "what've", "whatever", "whats", "when", "when's", "whence", "whenever", "where", "where's", "whereafter", "whereas", "whereby", "wherein", "wheres", "whereupon", "wherever", "whether", "which", "whichever", "while", "whilst", "whim", "whither", "who", "who'd", "who'll", "who's", "whod", "whoever", "whole", "whom", "whomever", "whos", "whose", "why", "why's", "widely", "will", "willing", "wish", "with", "within", "without", "won't", "wonder", "word", "words", "world", "would", "wouldn't", "www", "x", "y", "yes", "yet", "you", "you'd", "you'll", "you're", "you've", "youd", "your", "youre", "yours", "yourself", "yourselves", "z", "zero"]).freeze
  # N.B. Some English contractions are ambigous (i.e. "she's" can mean "she has" or "she is").
  # Pragmatic Tokenizer will return the most frequently appearing expanded contraction. Regardless, this should
  # be rather insignificant as in most cases one is probably removing stop words.
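The only change to the set is the addition of the dotted forms `"i.e."` and `"e.g."` at its head; the undotted `"ie"` and `"eg"` were already present. A quick membership check (assuming the module is reachable as `PragmaticTokenizer::Languages::English`, per the gem's usual nesting):

```ruby
require 'pragmatic_tokenizer'

stop_words = PragmaticTokenizer::Languages::English::STOP_WORDS
stop_words.include?("i.e.")  # => true (added in 3.0.4)
stop_words.include?("e.g.")  # => true (added in 3.0.4)
stop_words.include?("ie")    # => true (was already present)
```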
data/lib/pragmatic_tokenizer/post_processor.rb CHANGED
@@ -31,12 +31,12 @@ module PragmaticTokenizer
     end
 
     def post_process
-      separate_ending_punctuation(method_name3)
+      separate_ending_punctuation(post_process_punctuation)
     end
 
     private
 
-    def method_name3
+    def post_process_punctuation
       separated = separate_ending_punctuation(full_stop_separated_tokens)
       procs = [unified1, split_unknown_period1, split_unknown_period2, split_emoji]
       procs.reduce(separated) { |a, e| a.flat_map(&e) }
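The renamed `post_process_punctuation` makes the pipeline's shape easier to see: each proc maps one token to one or more tokens, and `reduce` threads the token array through every proc via `flat_map`. A self-contained sketch of that pattern (these procs are made up for illustration; the real ones are `unified1`, `split_unknown_period1`, `split_unknown_period2` and `split_emoji`):

```ruby
# Each proc takes a token and returns a token or an array of tokens.
split_commas = ->(t) { t.split(",") }
drop_empties = ->(t) { t.empty? ? [] : [t] }

procs  = [split_commas, drop_empties]
tokens = ["a,b", "", "c"]

procs.reduce(tokens) { |a, e| a.flat_map(&e) }
# => ["a", "b", "c"]
```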
data/lib/pragmatic_tokenizer/tokenizer.rb CHANGED
@@ -20,7 +20,8 @@ module PragmaticTokenizer
     REGEX_DOMAIN = /(\s+|\A)[a-z0-9]{2,}([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix
     REGEX_URL = /(http|https)(\.|:)/
     REGEX_HYPHEN = /\-/
-    REGEX_UNDERSCORE = /\_/
+    REGEX_LONG_WORD = /\-|\_/
+    REGEXP_SPLIT_CHECK = /@|＠|(http)/
     REGEX_CONTRACTIONS = /[‘’‚‛‹›'´`]/
     REGEX_APOSTROPHE_S = /['’`́]s$/
     REGEX_EMAIL = /\S+(@|＠)\S+\.\S+/
@@ -126,7 +127,7 @@ module PragmaticTokenizer
       # TODO: why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
       @contractions.merge!(@language_module::CONTRACTIONS) if @contractions.empty?
       @abbreviations += @language_module::ABBREVIATIONS if @abbreviations.empty?
-      @stop_words += @language_module::STOP_WORDS if @stop_words.empty? && @filter_languages.empty?
+      @stop_words += @language_module::STOP_WORDS if @stop_words.empty?
 
       @filter_languages.each do |lang|
         language = Languages.get_language_by_code(lang)
@@ -286,12 +287,11 @@ module PragmaticTokenizer
 
     def split_long_words!
       @tokens = @tokens
-                .flat_map { |t| t.length > @long_word_split ? t.split(REGEX_HYPHEN) : t }
-                .flat_map { |t| t.length > @long_word_split ? t.split(REGEX_UNDERSCORE) : t }
+                .flat_map { |t| (t.length > @long_word_split && t !~ REGEXP_SPLIT_CHECK) ? t.split(REGEX_LONG_WORD) : t }
     end
 
-    def chosen_case(token)
-      @downcase ? Unicode.downcase(token) : token
+    def chosen_case(txt)
+      @downcase ? Unicode.downcase(txt) : txt
     end
 
     def inverse_case(token)
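The two hyphen/underscore passes collapse into one, and the new `REGEXP_SPLIT_CHECK` guard exempts tokens that look like handles, email addresses or URLs, which the new specs further down exercise. A standalone sketch of the guard (constants copied from the hunk above; the `long_word_split` threshold is arbitrary):

```ruby
REGEXP_SPLIT_CHECK = /@|＠|(http)/
REGEX_LONG_WORD    = /\-|\_/
long_word_split    = 5

tokens = ["@john_doe", "mathematics-test", "http://test.com/some_path"]
tokens.flat_map do |t|
  (t.length > long_word_split && t !~ REGEXP_SPLIT_CHECK) ? t.split(REGEX_LONG_WORD) : t
end
# => ["@john_doe", "mathematics", "test", "http://test.com/some_path"]
```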
data/lib/pragmatic_tokenizer/version.rb CHANGED
@@ -1,3 +1,3 @@
 module PragmaticTokenizer
-  VERSION = "3.0.3".freeze
+  VERSION = "3.0.4".freeze
 end
data/spec/pragmatic_tokenizer_spec.rb CHANGED
@@ -175,7 +175,7 @@ describe PragmaticTokenizer do
           remove_stop_words: true,
           language: 'de'
       )
-      expect(pt.tokenize(text)).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "english", "."])
+      expect(pt.tokenize(text)).to eq(["lehrer_in", "schüler_innen", ".", "english", "."])
     end
 
     it 'removes English and German stopwords' do
@@ -443,6 +443,17 @@ describe PragmaticTokenizer do
       end
     end
 
+    context 'option (downcase)' do
+      it 'does not downcase URLs' do
+        skip "NOT IMPLEMENTED"
+        text = "Here are some domains and urls GOOGLE.com http://test.com/UPPERCASE."
+        pt = PragmaticTokenizer::Tokenizer.new(
+            downcase: :true
+        )
+        expect(pt.tokenize(text)).to eq(["here", "are", "some", "domains", "and", "urls", "GOOGLE.com", "http://test.com/UPPERCASE", "."])
+      end
+    end
+
     context 'option (domains)' do
       it 'tokenizes a string #001' do
         text = "Here are some domains and urls google.com https://www.google.com www.google.com."
@@ -485,6 +496,38 @@ describe PragmaticTokenizer do
     end
 
     context 'option (long_word_split)' do
+      it 'should not split twitter handles' do
+        text = "@john_doe"
+        pt = PragmaticTokenizer::Tokenizer.new(
+            long_word_split: 5
+        )
+        expect(pt.tokenize(text)).to eq(["@john_doe"])
+      end
+
+      it 'should not split emails' do
+        text = "john_doe@something.com"
+        pt = PragmaticTokenizer::Tokenizer.new(
+            long_word_split: 5
+        )
+        expect(pt.tokenize(text)).to eq(["john_doe@something.com"])
+      end
+
+      it 'should not split emails 2' do
+        text = "john_doe＠something.com"
+        pt = PragmaticTokenizer::Tokenizer.new(
+            long_word_split: 5
+        )
+        expect(pt.tokenize(text)).to eq(["john_doe＠something.com"])
+      end
+
+      it 'should not split urls' do
+        text = "http://test.com/some_path"
+        pt = PragmaticTokenizer::Tokenizer.new(
+            long_word_split: 5
+        )
+        expect(pt.tokenize(text)).to eq(["http://test.com/some_path"])
+      end
+
       it 'tokenizes a string #001' do
         text = "Some main-categories of the mathematics-test have sub-examples that most 14-year olds can't answer, therefor the implementation-instruction made in the 1990-years needs to be revised."
         pt = PragmaticTokenizer::Tokenizer.new(
@@ -1041,6 +1084,15 @@ describe PragmaticTokenizer do
       expect(pt.tokenize(text)).to eq(["short", "sentence", "explanations", "."])
     end
 
+    it 'removes stop words 2' do
+      text = 'This is a short sentence with explanations and stop words i.e. is a stop word as so is e.g. I think.'
+      pt = PragmaticTokenizer::Tokenizer.new(
+          language: 'en',
+          remove_stop_words: true
+      )
+      expect(pt.tokenize(text)).to eq(["short", "sentence", "explanations", "."])
+    end
+
     it 'removes user-supplied stop words' do
       text = 'This is a short sentence with explanations and stop words.'
       pt = PragmaticTokenizer::Tokenizer.new(
data/spec/performance_spec.rb CHANGED
@@ -29,6 +29,7 @@ describe PragmaticTokenizer do
   # 24.2
   # 23.2
   # 11.6
+  # 12.0
   # it 'is fast? (long strings)' do
  # string = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 1000
  # puts "LENGTH: #{string.length}"
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 3.0.3
+  version: 3.0.4
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-02-16 00:00:00.000000000 Z
+date: 2016-03-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode