RubyGems - pragmatic_tokenizer - Versions diffs - 0.4.0 → 0.4.1 - Mend

pragmatic_tokenizer 0.4.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

checksums.yaml +4 -4
data/README.md +6 -0
data/lib/pragmatic_tokenizer/languages/common.rb +0 -1
data/lib/pragmatic_tokenizer/languages/english.rb +3 -1
data/lib/pragmatic_tokenizer/languages/french.rb +0 -1
data/lib/pragmatic_tokenizer/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 2f8b5c190992003a3b48fc266d801c818d02d928
-  data.tar.gz: d8782feae2c911fc4fb8495f04aff32bd95ac45a
+  metadata.gz: 8967535413a097c13a81bfd584401980223cb98b
+  data.tar.gz: 085a2582c4eed4cca4cd7102f141ccdd05994230
 SHA512:
-  metadata.gz: 28dcbe0dcfddb462fd2273bbb41d3e3a9c89e6ed74aeafda8e2776e5ac7ba6b4d94d958b6c38f7738779e7ffc99568fdb46df9b7c664dbef4fb750e427d96af4
-  data.tar.gz: 8c3954220a4f35efbf883bf82338e1e67b4f1eb2fe574c7f9f3c825a46e8ef542d552172da3c09224360903673660d5cbd5282611475510dbf5a58a72c74af66
+  metadata.gz: 70c4d278a74ca66cc8728992009f2701f661f8fc52e0c474f39c98abc8e0253ae5531b23f2b5bd0697d03a1c555ffdcad9472944a6146d0796d83c0a64fcfc72
+  data.tar.gz: 0940fba61b00d1f4d603a38dc7547f11fa3e08e05fbb316eb861f8517cc45396bc3e9f43dc7fa31dae60406b89b04c11e6ce539073abcd8be0f48e8a1f555ba4

data/README.md CHANGED

@@ -378,6 +378,12 @@ Abbreviations: No
 Stop Words: Yes
 Contractions: No
+## Resources
+* [The Art of Tokenization](https://www.ibm.com/developerworks/community/blogs/nlp/entry/tokenization?lang=en)
+* [Handbook Of Natural Language Processing Second Edition](https://karczmarczuk.users.greyc.fr/TEACH/TAL/Doc/Handbook%20Of%20Natural%20Language%20Processing,%20Second%20Edition%20Chapman%20&%20Hall%20Crc%20Machine%20Learning%20&%20Pattern%20Recognition%202010.pdf)
 ## Development
 After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.

data/lib/pragmatic_tokenizer/languages/common.rb CHANGED

@@ -12,7 +12,6 @@ module PragmaticTokenizer
       class SingleQuotes
         def handle_single_quotes(text)
-          text.gsub!(/`(?!`)(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ') || text
           text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
           # Separate right single quotes
           text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text

data/lib/pragmatic_tokenizer/languages/english.rb CHANGED

@@ -4,6 +4,9 @@ module PragmaticTokenizer
       include Languages::Common
       ABBREVIATIONS = ["adj", "adm", "adv", "al", "ala", "alta", "apr", "arc", "ariz", "ark", "art", "assn", "asst", "attys", "aug", "ave", "bart", "bld", "bldg", "blvd", "brig", "bros", "btw", "cal", "calif", "capt", "cl", "cmdr", "co", "col", "colo", "comdr", "con", "conn", "corp", "cpl", "cres", "ct", "d.phil", "dak", "dec", "del", "dept", "det", "dist", "dr", "dr.phil", "dr.philos", "drs", "e.g", "ens", "esp", "esq", "etc", "exp", "expy", "ext", "feb", "fed", "fla", "ft", "fwy", "fy", "ga", "gen", "gov", "hon", "hosp", "hr", "hway", "hwy", "i.e", "ia", "id", "ida", "ill", "inc", "ind", "ing", "insp", "is", "jan", "jr", "jul", "jun", "kan", "kans", "ken", "ky", "la", "lt", "ltd", "maj", "man", "mar", "mass", "may", "md", "me", "med", "messrs", "mex", "mfg", "mich", "min", "minn", "miss", "mlle", "mm", "mme", "mo", "mont", "mr", "mrs", "ms", "msgr", "mssrs", "mt", "mtn", "neb", "nebr", "nev", "no", "nos", "nov", "nr", "oct", "ok", "okla", "ont", "op", "ord", "ore", "p", "pa", "pd", "pde", "penn", "penna", "pfc", "ph", "ph.d", "pl", "plz", "pp", "prof", "pvt", "que", "rd", "ref", "rep", "reps", "res", "rev", "rt", "sask", "sec", "sen", "sens", "sep", "sept", "sfc", "sgt", "sr", "st", "supt", "surg", "tce", "tenn", "tex", "u.s", "univ", "usafa", "ut", "v", "va", "ver", "vs", "vt", "wash", "wis", "wisc", "wy", "wyo", "yuk"]
       STOP_WORDS = ["&#;f", "'ll", "'ve", "+//", "-/+", "</li>", "</p>", "</td>", "<br", "<br/>", "<br/><br/>", "<li>", "<p>", "<sup></sup>", "<sup></sup></li>", "<td", "<td>", "___", "____", "_____", "______", "_______", "________", "_________", "__________", "___________", "____________", "_____________", "______________", "a", "a's", "able", "about", "above", "abroad", "abst", "accordance", "according", "accordingly", "across", "act", "actually", "added", "adj", "adopted", "affected", "affecting", "affects", "after", "afterwards", "again", "against", "ago", "ah", "ahead", "ain't", "all", "allow", "allows", "almost", "alone", "along", "alongside", "already", "also", "although", "always", "am", "amid", "amidst", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "are", "aren", "aren't", "arent", "arise", "around", "as", "aside", "ask", "asking", "associated", "at", "auth", "available", "away", "awfully", "b", "back", "backward", "backwards", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bill", "biol", "both", "bottom", "brief", "briefly", "but", "by", "c", "c'mon", "c's", "ca", "call", "came", "can", "can't", "cannot", "cant", "caption", "cause", "causes", "certain", "certainly", "changes", "class=", "clearly", "co", "co.", "com", "come", "comes", "computer", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn't", "couldnt", "course", "cry", "currently", "d", "dare", "daren't", "date", "de", "definitely", "describe", "described", "despite", "detail", "did", "didn't", "different", "directly", "do", "does", "doesn't", "doing", "don't", "done", "down", "downwards", "due", "during", "e", "each", "ed", "edu", "effect", "eg", "eight", "eighty", "either", "eleven", "else", "elsewhere", "empty", "end", "ending", "enough", "entirely", "especially", "et", "et-al", "etc", "even", "ever", "evermore", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "f", "fairly", "far", "farther", "few", "fewer", "ff", "fifteen", "fifth", "fify", "fill", "find", "fire", "first", "five", "fix", "followed", "following", "follows", "for", "forever", "former", "formerly", "forth", "forty", "forward", "found", "four", "from", "front", "full", "further", "furthermore", "g", "gave", "get", "gets", "getting", "give", "given", "gives", "giving", "go", "goes", "going", "gone", "got", "gotten", "greetings", "h", "had", "hadn't", "half", "happens", "hardly", "has", "hasn't", "hasnt", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "hed", "hello", "help", "hence", "her", "here", "here's", "hereafter", "hereby", "herein", "heres", "hereupon", "hers", "herself", "hes", "hi", "hid", "him", "himself", "his", "hither", "home", "hopefully", "how", "how's", "howbeit", "however", "http", "https", "hundred", "i", "i'd", "i'll", "i'm", "i've", "id", "ie", "if", "ignored", "im", "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "inc.", "indeed", "index", "indicate", "indicated", "indicates", "information", "ing", "inner", "inside", "insofar", "instead", "interest", "into", "invention", "inward", "is", "isn't", "it", "it'd", "it'll", "it's", "itd", "its", "itself", "j", "just", "k", "keep", "keeps", "kept", "keys", "kg", "km", "know", "known", "knows", "l", "largely", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "let's", "lets", "like", "liked", "likely", "likewise", "line", "little", "look", "looking", "looks", "low", "lower", "ltd", "m", "made", "mainly", "make", "makes", "many", "may", "maybe", "mayn't", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "mightn't", "mill", "million", "mine", "minus", "miss", "ml", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "much", "mug", "must", "mustn't", "my", "myself", "n", "na", "name", "namely", "nay", "nd", "near", "nearly", "necessarily", "necessary", "need", "needn't", "needs", "neither", "never", "neverf", "neverless", "nevertheless", "new", "next", "nine", "ninety", "no", "no-one", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "notwithstanding", "novel", "now", "nowhere", "o", "obtain", "obtained", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "omitted", "on", "once", "one", "one's", "ones", "only", "onto", "opposite", "or", "ord", "other", "others", "otherwise", "ought", "oughtn't", "our", "ours", "ours", "ourselves", "out", "outside", "over", "overall", "owing", "own", "p", "page", "pages", "part", "particular", "particularly", "past", "per", "perhaps", "placed", "please", "plus", "poorly", "possible", "possibly", "potentially", "pp", "predominantly", "present", "presumably", "previously", "primarily", "probably", "promptly", "proud", "provided", "provides", "put", "q", "que", "quickly", "quite", "qv", "r", "ran", "rather", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "respectively", "resulted", "resulting", "results", "right", "round", "run", "s", "said", "same", "saw", "say", "saying", "says", "sec", "second", "secondly", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall", "shan't", "she", "she'd", "she'll", "she's", "shed", "shes", "should", "shouldn't", "show", "showed", "shown", "showns", "shows", "side", "significant", "significantly", "similar", "similarly", "since", "sincere", "six", "sixty", "slightly", "so", "some", "somebody", "someday", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specifically", "specified", "specify", "specifying", "state", "states", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "system", "t", "t's", "take", "taken", "taking", "tell", "ten", "tends", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "that's", "that've", "thats", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "there'd", "there'll", "there're", "there's", "there've", "thereafter", "thereby", "thered", "therefore", "therein", "thereof", "therere", "theres", "thereto", "thereupon", "these", "they", "they'd", "they'll", "they're", "they've", "theyd", "theyre", "thick", "thin", "thing", "things", "think", "third", "thirty", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "til", "till", "tip", "to", "together", "too", "took", "top", "toward", "towards", "tried", "tries", "truly", "try", "trying", "ts", "twelve", "twenty", "twice", "two", "u", "un", "under", "underneath", "undoing", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "up", "upon", "ups", "upwards", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "uucp", "v", "value", "various", "versus", "very", "via", "viz", "vol", "vols", "vs", "w", "want", "wants", "was", "wasn't", "way", "we", "we'd", "we'll", "we're", "we've", "wed", "welcome", "well", "went", "were", "weren't", "what", "what'll", "what's", "what're", "what've", "whatever", "whats", "when", "when's", "whence", "whenever", "where", "where's", "whereafter", "whereas", "whereby", "wherein", "wheres", "whereupon", "wherever", "whether", "which", "whichever", "while", "whilst", "whim", "whither", "who", "who'd", "who'll", "who's", "whod", "whoever", "whole", "whom", "whomever", "whos", "whose", "why", "why's", "widely", "will", "willing", "wish", "with", "within", "without", "won't", "wonder", "word", "words", "world", "would", "wouldn't", "www", "x", "y", "yes", "yet", "you", "you'd", "you'll", "you're", "you've", "youd", "your", "youre", "yours", "yourself", "yourselves", "z", "zero"]
+      # N.B. Some English contractions are ambigous (i.e. "she's" can mean "she has" or "she is").
+      # Pragmatic Tokenizer will return the most frequently appearing expanded contraction. Regardless, this should
+      # be rather insignificant as in most cases one is probably removing stop words.
       CONTRACTIONS = {
         "i'm"               => "i am",
         "i'll"              => "i will",
@@ -88,7 +91,6 @@ module PragmaticTokenizer
       }
       class SingleQuotes
         def handle_single_quotes(text)
-          text.gsub!(/`(?!`)(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ') || text
           # Convert left quotes to special character except for 'Twas or 'twas
           text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
           text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text

data/lib/pragmatic_tokenizer/languages/french.rb CHANGED

@@ -9,7 +9,6 @@ module PragmaticTokenizer
       class SingleQuotes
         def handle_single_quotes(text)
           text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
-          text.gsub!(/`(?!`)(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ') || text
           text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
           text.gsub!(/l\'/, '\1 l☮ \2') || text
           text.gsub!(/L\'/, '\1 L☮ \2') || text

data/lib/pragmatic_tokenizer/version.rb CHANGED

@@ -1,3 +1,3 @@
 module PragmaticTokenizer
-  VERSION = "0.4.0"
+  VERSION = "0.4.1"
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.4.1
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-01-13 00:00:00.000000000 Z
+date: 2016-01-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode