pragmatic_tokenizer 0.1.12 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +49 -1
- data/lib/pragmatic_tokenizer/languages/bulgarian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/deutsch.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/english.rb +4 -4
- data/lib/pragmatic_tokenizer/processor.rb +34 -20
- data/lib/pragmatic_tokenizer/tokenizer.rb +58 -12
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +1 -0
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4d4f4efd555a719dda2fd957aa2683a8846140f5
|
4
|
+
data.tar.gz: 7328a978a04c6a7754aab8d5b013d1174744bea5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7f0c65f7c717424af8fcc2e00ede126803ac5a4980e819d18020acb91b04e722f0e44e7be4067aeee0d35f165443ad1dd5534af5fe8f85af2ae63f4f83ece756
|
7
|
+
data.tar.gz: be8292abc8548194d660baa35947e4f5b7fbc365dad518a199f8741224bf1c864b214604b67d53e3265b7bda9f5ea561a1832f726c4dcbd04d40cbd1d8db7370
|
data/README.md
CHANGED
@@ -24,6 +24,7 @@ Or install it yourself as:
|
|
24
24
|
|
25
25
|
* If no language is specified, the library will default to English.
|
26
26
|
* To specify a language use its two character [ISO 639-1 code](https://www.tm-town.com/languages).
|
27
|
+
* Pragmatic Tokenizer will unescape any HTML entities.
|
27
28
|
|
28
29
|
**Options**
|
29
30
|
|
@@ -85,10 +86,19 @@ Or install it yourself as:
|
|
85
86
|
|
86
87
|
<hr>
|
87
88
|
|
89
|
+
##### `downcase`
|
90
|
+
**default** = `'true'`
|
91
|
+
|
92
|
+
<hr>
|
93
|
+
|
88
94
|
##### `minimum_length`
|
89
95
|
**default** = `0`
|
90
96
|
The minimum number of characters a token should be.
|
91
97
|
|
98
|
+
**Methods**
|
99
|
+
|
100
|
+
#### `#tokenize`
|
101
|
+
|
92
102
|
**Example Usage**
|
93
103
|
```ruby
|
94
104
|
text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
|
@@ -131,6 +141,44 @@ PragmaticTokenizer::Tokenizer.new(text, minimum_length: 6).tokenize
|
|
131
141
|
# => ["minimum", "length"]
|
132
142
|
```
|
133
143
|
|
144
|
+
<hr>
|
145
|
+
|
146
|
+
#### `#urls`
|
147
|
+
Extract only valid URL tokens
|
148
|
+
|
149
|
+
*Not Yet Implemented*
|
150
|
+
|
151
|
+
<hr>
|
152
|
+
|
153
|
+
#### `#emails`
|
154
|
+
Extract only valid email tokens
|
155
|
+
|
156
|
+
*Not Yet Implemented*
|
157
|
+
|
158
|
+
<hr>
|
159
|
+
|
160
|
+
#### `#hashtags`
|
161
|
+
Extract only valid hashtag tokens
|
162
|
+
|
163
|
+
*Not Yet Implemented*
|
164
|
+
|
165
|
+
<hr>
|
166
|
+
|
167
|
+
#### `#mentions`
|
168
|
+
Extract only valid @ mention tokens
|
169
|
+
|
170
|
+
<hr>
|
171
|
+
|
172
|
+
#### `#emoticons`
|
173
|
+
Extract only simple emoticon tokens
|
174
|
+
|
175
|
+
<hr>
|
176
|
+
|
177
|
+
#### `#emoji`
|
178
|
+
Extract only valid† emoji tokens
|
179
|
+
|
180
|
+
*†matches all 1012 single-character Unicode Emoji (all except for two-character flags)*
|
181
|
+
|
134
182
|
## Language Support
|
135
183
|
|
136
184
|
The following lists the current level of support for different languages. Pull requests or help for any languages that are not fully supported would be greatly appreciated.
|
@@ -150,7 +198,7 @@ Stop Words: Yes
|
|
150
198
|
Contractions: No
|
151
199
|
|
152
200
|
##### Bulgarian
|
153
|
-
Specs:
|
201
|
+
Specs: More needed
|
154
202
|
Abbreviations: Yes
|
155
203
|
Stop Words: Yes
|
156
204
|
Contractions: No
|
@@ -2,7 +2,7 @@ module PragmaticTokenizer
|
|
2
2
|
module Languages
|
3
3
|
module Bulgarian
|
4
4
|
include Languages::Common
|
5
|
-
ABBREVIATIONS = ['акад', 'ал', 'бел.а', 'б.р', 'Б.р', 'б.ред', 'бел.пр', 'бр', 'бул', 'в', 'вж', 'вкл', 'вм', 'вр', 'ген', 'г', 'гр', 'дм', 'доц', 'др', 'ем', 'заб', 'зам', 'инж', 'кг', 'км', 'лв', 'к.с', 'кв', 'кв.м', 'кор', 'куб', 'куб.м', 'л', 'м', 'м.г', 'мин', 'млрд', 'млн', 'мм', 'напр', 'н.с', 'пл', 'полк', 'проф', 'р', 'рис', 'с', 'св', 'сек', 'см', 'сп', 'срв', 'ст', 'стр', 'т.е', 'т.г', 'т', 'табл', 'т.н', 'т.нар', 'тел', 'У', 'Дж', 'ул', 'фиг', 'хил', 'ха', 'ч', 'чл', 'щ.д'
|
5
|
+
ABBREVIATIONS = ['акад', 'ал', 'бел.а', 'б.р', 'Б.р', 'б.ред', 'бел.пр', 'бр', 'бул', 'в', 'вж', 'вкл', 'вм', 'вр', 'ген', 'г', 'гр', 'дм', 'доц', 'др', 'ем', 'заб', 'зам', 'инж', 'кг', 'км', 'лв', 'к.с', 'кв', 'кв.м', 'кор', 'куб', 'куб.м', 'л', 'м', 'м.г', 'мин', 'млрд', 'млн', 'мм', 'напр', 'н.с', 'пл', 'полк', 'проф', 'р', 'рис', 'с', 'св', 'сек', 'см', 'сп', 'срв', 'ст', 'стр', 'т.е', 'т.г', 'т', 'табл', 'т.н', 'т.нар', 'тел', 'У', 'Дж', 'ул', 'фиг', 'хил', 'ха', 'ч', 'чл', 'щ.д']
|
6
6
|
STOP_WORDS = ["а", "автентичен", "аз", "ако", "ала", "бе", "без", "беше", "би", "бивш", "бивша", "бившо", "бил", "била", "били", "било", "благодаря", "близо", "бъдат", "бъде", "бяха", "в", "вас", "ваш", "ваша", "вероятно", "вече", "взема", "ви", "вие", "винаги", "внимава", "време", "все", "всеки", "всички", "всичко", "всяка", "във", "въпреки", "върху", "г", "ги", "главен", "главна", "главно", "глас", "го", "година", "години", "годишен", "д", "да", "дали", "два", "двама", "двамата", "две", "двете", "ден", "днес", "дни", "до", "добра", "добре", "добро", "добър", "докато", "докога", "дори", "досега", "доста", "друг", "друга", "други", "е", "евтин", "едва", "един", "една", "еднаква", "еднакви", "еднакъв", "едно", "екип", "ето", "живот", "за", "забавям", "зад", "заедно", "заради", "засега", "заспал", "затова", "защо", "защото", "и", "из", "или", "им", "има", "имат", "иска", "й", "каза", "как", "каква", "какво", "както", "какъв", "като", "кога", "когато", "което", "които", "кой", "който", "колко", "която", "къде", "където", "към", "лесен", "лесно", "ли", "лош", "м", "май", "малко", "ме", "между", "мек", "мен", "месец", "ми", "много", "мнозина", "мога", "могат", "може", "мокър", "моля", "момента", "му", "н", "на", "над", "назад", "най", "направи", "напред", "например", "нас", "не", "него", "нещо", "нея", "ни", "ние", "никой", "нито", "нищо", "но", "нов", "нова", "нови", "новина", "някои", "някой", "няколко", "няма", "обаче", "около", "освен", "особено", "от", "отгоре", "отново", "още", "пак", "по", "повече", "повечето", "под", "поне", "поради", "после", "почти", "прави", "пред", "преди", "през", "при", "пък", "първата", "първи", "първо", "пъти", "равен", "равна", "с", "са", "сам", "само", "се", "сега", "си", "син", "скоро", "след", "следващ", "сме", "смях", "според", "сред", "срещу", "сте", "съм", "със", "също", "т", "тази", "така", "такива", "такъв", "там", "твой", "те", "тези", "ти", "т.н.", "то", "това", "тогава", "този", "той", "толкова", "точно", "три", "трябва", "тук", "тъй", "тя", "тях", "у", "утре", "харесва", "хиляди", "ч", "часа", "че", "често", "чрез", "ще", "щом", "юмрук", "я", "як"]
|
7
7
|
CONTRACTIONS = {}
|
8
8
|
end
|
@@ -3,7 +3,7 @@ module PragmaticTokenizer
|
|
3
3
|
module Deutsch
|
4
4
|
include Languages::Common
|
5
5
|
ABBREVIATIONS = ["adj", "adm", "adv", "art", "art", "asst", "b.a", "b.s", "bart", "bldg", "brig", "bros", "bse", "buchst", "bzgl", "bzw", "c.-à-d", "ca", "ca", "capt", "chr", "cmdr", "co", "col", "comdr", "con", "corp", "cpl", "d.h", "d.j", "dergl", "dgl", "dkr", "dr ", "ens", "etc", "ev ", "evtl", "ff", "g.g.a", "g.u", "gen", "ggf", "gov", "hon", "hosp", "i.f", "i.h.v", "ii", "iii", "insp", "iv", "ix", "jun", "k.o", "kath ", "lfd", "lt", "ltd", "m.e", "maj", "med", "messrs", "mio", "mlle", "mm", "mme", "mr", "mrd", "mrs", "ms", "msgr", "mwst", "no", "no", "nos", "nos", "nr", "nr", "o.ä", "op", "ord", "pfc", "ph", "pp", "pp", "prof", "pvt", "rep", "reps", "res", "rev", "rt", "s.p.a", "sa", "sen", "sens", "sfc", "sgt", "sog", "sogen", "spp", "sr", "st", "std", "str ", "supt", "surg", "u.a ", "u.e", "u.s.w", "u.u", "u.ä", "usf", "usw", "v", "vgl", "vi", "vii", "viii", "vs", "x", "xi", "xii", "xiii", "xiv", "xix", "xv", "xvi", "xvii", "xviii", "xx", "z.b", "z.t", "z.z", "z.zt", "zt", "zzt", "Ä", "ä"]
|
6
|
-
STOP_WORDS = ["a", "ab", "aber", "ach", "acht", "achte", "achten", "achter", "achtes", "ag", "alle", "allein", "allem", "allen", "aller", "allerdings", "alles", "allgemeinen", "als", "also", "am", "an", "andere", "anderen", "andern", "anders", "au", "auch", "auf", "aus", "ausser", "ausserdem", "außer", "außerdem", "b", "bald", "bei", "beide", "beiden", "beim", "beispiel", "bekannt", "bereits", "besonders", "besser", "besten", "bin", "bis", "bisher", "bist", "c", "d", "d.h", "da", "dabei", "dadurch", "dafür", "dagegen", "daher", "dahin", "dahinter", "damals", "damit", "danach", "daneben", "dank", "dann", "daran", "darauf", "daraus", "darf", "darfst", "darin", "darum", "darunter", "darüber", "das", "dasein", "daselbst", "dass", "dasselbe", "davon", "davor", "dazu", "dazwischen", "daß", "dein", "deine", "deinem", "deiner", "dem", "dementsprechend", "demgegenüber", "demgemäss", "demgemäß", "demselben", "demzufolge", "den", "denen", "denn", "denselben", "der", "deren", "derjenige", "derjenigen", "dermassen", "dermaßen", "derselbe", "derselben", "des", "deshalb", "desselben", "dessen", "deswegen", "dich", "die", "diejenige", "diejenigen", "dies", "diese", "dieselbe", "dieselben", "diesem", "diesen", "dieser", "dieses", "dir", "doch", "dort", "drei", "drin", "dritte", "dritten", "dritter", "drittes", "du", "durch", "durchaus", "durfte", "durften", "dürfen", "dürft", "e", "eben", "ebenso", "ehrlich", "ei", "ei,", "eigen", "eigene", "eigenen", "eigener", "eigenes", "ein", "einander", "eine", "einem", "einen", "einer", "eines", "einige", "einigen", "einiger", "einiges", "einmal", "eins", "elf", "en", "ende", "endlich", "entweder", "er", "erst", "erste", "ersten", "erster", "erstes", "es", "etwa", "etwas", "euch", "euer", "eure", "f", "früher", "fünf", "fünfte", "fünften", "fünfter", "fünftes", "für", "g", "gab", "ganz", "ganze", "ganzen", "ganzer", "ganzes", "gar", "gedurft", "gegen", "gegenüber", "gehabt", "gehen", "geht", "gekannt", "gekonnt", "gemacht", "gemocht", "gemusst", "genug", "gerade", "gern", "gesagt", "geschweige", "gewesen", "gewollt", "geworden", "gibt", "ging", "gleich", "gott", "gross", "grosse", "grossen", "grosser", "grosses", "groß", "große", "großen", "großer", "großes", "gut", "gute", "guter", "gutes", "h", "habe", "haben", "habt", "hast", "hat", "hatte", "hatten", "hattest", "hattet", "heisst", "her", "heute", "hier", "hin", "hinter", "hoch", "hätte", "hätten", "i", "ich", "ihm", "ihn", "ihnen", "ihr", "ihre", "ihrem", "ihren", "ihrer", "ihres", "im", "immer", "in", "indem", "infolgedessen", "ins", "irgend", "ist", "j", "ja", "jahr", "jahre", "jahren", "je", "jede", "jedem", "jeden", "jeder", "jedermann", "jedermanns", "jedes", "jedoch", "jemand", "jemandem", "jemanden", "jene", "jenem", "jenen", "jener", "jenes", "jetzt", "k", "kam", "kann", "kannst", "kaum", "kein", "keine", "keinem", "keinen", "keiner", "kleine", "kleinen", "kleiner", "kleines", "kommen", "kommt", "konnte", "konnten", "kurz", "können", "könnt", "könnte", "l", "lang", "lange", "leicht", "leide", "lieber", "los", "m", "machen", "macht", "machte", "mag", "magst", "mahn", "man", "manche", "manchem", "manchen", "mancher", "manches", "mann", "mehr", "mein", "meine", "meinem", "meinen", "meiner", "meines", "mensch", "menschen", "mich", "mir", "mit", "mittel", "mochte", "mochten", "morgen", "muss", "musst", "musste", "mussten", "muß", "mußt", "möchte", "mögen", "möglich", "mögt", "müssen", "müsst", "müßt", "n", "na", "nach", "nachdem", "nahm", "natürlich", "neben", "nein", "neue", "neuen", "neun", "neunte", "neunten", "neunter", "neuntes", "nicht", "nichts", "nie", "niemand", "niemandem", "niemanden", "noch", "nun", "nur", "o", "ob", "oben", "oder", "offen", "oft", "ohne", "p", "q", "r", "recht", "rechte", "rechten", "rechter", "rechtes", "richtig", "rund", "s", "sa", "sache", "sagt", "sagte", "sah", "satt", "schlecht", "schon", "sechs", "sechste", "sechsten", "sechster", "sechstes", "sehr", "sei", "seid", "seien", "sein", "seine", "seinem", "seinen", "seiner", "seines", "seit", "seitdem", "selbst", "sich", "sie", "sieben", "siebente", "siebenten", "siebenter", "siebentes", "sind", "so", "solang", "solche", "solchem", "solchen", "solcher", "solches", "soll", "sollen", "sollst", "sollt", "sollte", "sollten", "sondern", "sonst", "soweit", "sowie", "später", "statt", "t", "tag", "tage", "tagen", "tat", "teil", "tel", "tritt", "trotzdem", "tun", "u", "uhr", "um", "und", "und?", "uns", "unser", "unsere", "unserer", "unter", "v", "vergangenen", "viel", "viele", "vielem", "vielen", "vielleicht", "vier", "vierte", "vierten", "vierter", "viertes", "vom", "von", "vor", "w", "wahr?", "wann", "war", "waren", "wart", "warum", "was", "wegen", "weil", "weit", "weiter", "weitere", "weiteren", "weiteres", "welche", "welchem", "welchen", "welcher", "welches", "wem", "wen", "wenig", "wenige", "weniger", "weniges", "wenigstens", "wenn", "wer", "werde", "werden", "werdet", "weshalb", "wessen", "wie", "wieder", "wieso", "will", "willst", "wir", "wird", "wirklich", "wirst", "wo", "woher", "wohin", "wohl", "wollen", "wollt", "wollte", "wollten", "worden", "wurde", "wurden", "während", "währenddem", "währenddessen", "wäre", "würde", "würden", "x", "y", "z", "z.b", "zehn", "zehnte", "zehnten", "zehnter", "zehntes", "zeit", "zu", "zuerst", "zugleich", "zum", "zunächst", "zur", "zurück", "zusammen", "zwanzig", "zwar", "zwei", "zweite", "zweiten", "zweiter", "zweites", "zwischen", "zwölf", "über", "überhaupt", "übrigens"]
|
6
|
+
STOP_WORDS = ["a", "ab", "aber", "ach", "acht", "achte", "achten", "achter", "achtes", "ag", "alle", "allein", "allem", "allen", "aller", "allerdings", "alles", "allgemeinen", "als", "also", "am", "an", "andere", "anderen", "andern", "anders", "au", "auch", "auf", "aus", "ausser", "ausserdem", "außer", "außerdem", "b", "bald", "bei", "beide", "beiden", "beim", "beispiel", "bekannt", "bereits", "besonders", "besser", "besten", "bin", "bis", "bisher", "bist", "c", "d", "d.h", "da", "dabei", "dadurch", "dafür", "dagegen", "daher", "dahin", "dahinter", "damals", "damit", "danach", "daneben", "dank", "dann", "daran", "darauf", "daraus", "darf", "darfst", "darin", "darum", "darunter", "darüber", "das", "dasein", "daselbst", "dass", "dasselbe", "davon", "davor", "dazu", "dazwischen", "daß", "dein", "deine", "deinem", "deiner", "dem", "dementsprechend", "demgegenüber", "demgemäss", "demgemäß", "demselben", "demzufolge", "den", "denen", "denn", "denselben", "der", "deren", "derjenige", "derjenigen", "dermassen", "dermaßen", "derselbe", "derselben", "des", "deshalb", "desselben", "dessen", "deswegen", "dich", "die", "diejenige", "diejenigen", "dies", "diese", "dieselbe", "dieselben", "diesem", "diesen", "dieser", "dieses", "dir", "doch", "dort", "drei", "drin", "dritte", "dritten", "dritter", "drittes", "du", "durch", "durchaus", "durfte", "durften", "dürfen", "dürft", "e", "eben", "ebenso", "ehrlich", "ei", "ei,", "eigen", "eigene", "eigenen", "eigener", "eigenes", "ein", "einander", "eine", "einem", "einen", "einer", "eines", "einige", "einigen", "einiger", "einiges", "einmal", "eins", "elf", "en", "ende", "endlich", "entweder", "er", "erst", "erste", "ersten", "erster", "erstes", "es", "etwa", "etwas", "euch", "euer", "eure", "f", "früher", "fünf", "fünfte", "fünften", "fünfter", "fünftes", "für", "g", "gab", "ganz", "ganze", "ganzen", "ganzer", "ganzes", "gar", "gedurft", "gegen", "gegenüber", "gehabt", "gehen", "geht", "gekannt", "gekonnt", "gemacht", "gemocht", "gemusst", "genug", "gerade", "gern", "gesagt", "geschweige", "gewesen", "gewollt", "geworden", "gibt", "ging", "gleich", "gott", "gross", "grosse", "grossen", "grosser", "grosses", "groß", "große", "großen", "großer", "großes", "gut", "gute", "guter", "gutes", "h", "habe", "haben", "habt", "hast", "hat", "hatte", "hatten", "hattest", "hattet", "heisst", "her", "heute", "hier", "hin", "hinter", "hoch", "hätte", "hätten", "i", "ich", "ihm", "ihn", "ihnen", "ihr", "ihre", "ihrem", "ihren", "ihrer", "ihres", "im", "immer", "in", "indem", "infolgedessen", "ins", "irgend", "ist", "j", "ja", "jahr", "jahre", "jahren", "je", "jede", "jedem", "jeden", "jeder", "jedermann", "jedermanns", "jedes", "jedoch", "jemand", "jemandem", "jemanden", "jene", "jenem", "jenen", "jener", "jenes", "jetzt", "k", "kam", "kann", "kannst", "kaum", "kein", "keine", "keinem", "keinen", "keiner", "kleine", "kleinen", "kleiner", "kleines", "km", "kommen", "kommt", "konnte", "konnten", "kurz", "können", "könnt", "könnte", "l", "lang", "lange", "leicht", "leide", "lieber", "los", "m", "machen", "macht", "machte", "mag", "magst", "mahn", "man", "manche", "manchem", "manchen", "mancher", "manches", "mann", "mehr", "mein", "meine", "meinem", "meinen", "meiner", "meines", "mensch", "menschen", "mich", "mir", "mit", "mittel", "mochte", "mochten", "morgen", "muss", "musst", "musste", "mussten", "muß", "mußt", "möchte", "mögen", "möglich", "mögt", "müssen", "müsst", "müßt", "n", "na", "nach", "nachdem", "nahm", "natürlich", "neben", "nein", "neue", "neuen", "neun", "neunte", "neunten", "neunter", "neuntes", "nicht", "nichts", "nie", "niemand", "niemandem", "niemanden", "noch", "nun", "nur", "o", "ob", "oben", "oder", "offen", "oft", "ohne", "p", "q", "r", "recht", "rechte", "rechten", "rechter", "rechtes", "richtig", "rund", "s", "sa", "sache", "sagt", "sagte", "sah", "satt", "schlecht", "schon", "sechs", "sechste", "sechsten", "sechster", "sechstes", "sehr", "sei", "seid", "seien", "sein", "seine", "seinem", "seinen", "seiner", "seines", "seit", "seitdem", "selbst", "sich", "sie", "sieben", "siebente", "siebenten", "siebenter", "siebentes", "sind", "so", "solang", "solche", "solchem", "solchen", "solcher", "solches", "soll", "sollen", "sollst", "sollt", "sollte", "sollten", "sondern", "sonst", "soweit", "sowie", "später", "statt", "t", "tag", "tage", "tagen", "tat", "teil", "tel", "tritt", "trotzdem", "tun", "u", "uhr", "um", "und", "und?", "uns", "unser", "unsere", "unserer", "unter", "v", "vergangenen", "viel", "viele", "vielem", "vielen", "vielleicht", "vier", "vierte", "vierten", "vierter", "viertes", "vom", "von", "vor", "w", "wahr?", "wann", "war", "waren", "wart", "warum", "was", "wegen", "weil", "weit", "weiter", "weitere", "weiteren", "weiteres", "welche", "welchem", "welchen", "welcher", "welches", "wem", "wen", "wenig", "wenige", "weniger", "weniges", "wenigstens", "wenn", "wer", "werde", "werden", "werdet", "weshalb", "wessen", "wie", "wieder", "wieso", "will", "willst", "wir", "wird", "wirklich", "wirst", "wo", "woher", "wohin", "wohl", "wollen", "wollt", "wollte", "wollten", "worden", "wurde", "wurden", "während", "währenddem", "währenddessen", "wäre", "würde", "würden", "x", "y", "z", "z.b", "zehn", "zehnte", "zehnten", "zehnter", "zehntes", "zeit", "zu", "zuerst", "zugleich", "zum", "zunächst", "zur", "zurück", "zusammen", "zwanzig", "zwar", "zwei", "zweite", "zweiten", "zweiter", "zweites", "zwischen", "zwölf", "über", "überhaupt", "übrigens"]
|
7
7
|
CONTRACTIONS = {}
|
8
8
|
end
|
9
9
|
end
|
@@ -5,10 +5,10 @@ module PragmaticTokenizer
|
|
5
5
|
ABBREVIATIONS = ["adj", "adm", "adv", "al", "ala", "alta", "apr", "arc", "ariz", "ark", "art", "assn", "asst", "attys", "aug", "ave", "bart", "bld", "bldg", "blvd", "brig", "bros", "btw", "cal", "calif", "capt", "cl", "cmdr", "co", "col", "colo", "comdr", "con", "conn", "corp", "cpl", "cres", "ct", "d.phil", "dak", "dec", "del", "dept", "det", "dist", "dr", "dr.phil", "dr.philos", "drs", "e.g", "ens", "esp", "esq", "etc", "exp", "expy", "ext", "feb", "fed", "fla", "ft", "fwy", "fy", "ga", "gen", "gov", "hon", "hosp", "hr", "hway", "hwy", "i.e", "ia", "id", "ida", "ill", "inc", "ind", "ing", "insp", "is", "jan", "jr", "jul", "jun", "kan", "kans", "ken", "ky", "la", "lt", "ltd", "maj", "man", "mar", "mass", "may", "md", "me", "med", "messrs", "mex", "mfg", "mich", "min", "minn", "miss", "mlle", "mm", "mme", "mo", "mont", "mr", "mrs", "ms", "msgr", "mssrs", "mt", "mtn", "neb", "nebr", "nev", "no", "nos", "nov", "nr", "oct", "ok", "okla", "ont", "op", "ord", "ore", "p", "pa", "pd", "pde", "penn", "penna", "pfc", "ph", "ph.d", "pl", "plz", "pp", "prof", "pvt", "que", "rd", "ref", "rep", "reps", "res", "rev", "rt", "sask", "sec", "sen", "sens", "sep", "sept", "sfc", "sgt", "sr", "st", "supt", "surg", "tce", "tenn", "tex", "u.s", "univ", "usafa", "ut", "v", "va", "ver", "vs", "vt", "wash", "wis", "wisc", "wy", "wyo", "yuk"]
|
6
6
|
STOP_WORDS = ["&#;f", "'ll", "'ve", "+//", "-/+", "</li>", "</p>", "</td>", "<br", "<br/>", "<br/><br/>", "<li>", "<p>", "<sup></sup>", "<sup></sup></li>", "<td", "<td>", "___", "____", "_____", "______", "_______", "________", "_________", "__________", "___________", "____________", "_____________", "______________", "a", "a's", "able", "about", "above", "abroad", "abst", "accordance", "according", "accordingly", "across", "act", "actually", "added", "adj", "adopted", "affected", "affecting", "affects", "after", "afterwards", "again", "against", "ago", "ah", "ahead", "ain't", "all", "allow", "allows", "almost", "alone", "along", "alongside", "already", "also", "although", "always", "am", "amid", "amidst", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "are", "aren", "aren't", "arent", "arise", "around", "as", "aside", "ask", "asking", "associated", "at", "auth", "available", "away", "awfully", "b", "back", "backward", "backwards", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bill", "biol", "both", "bottom", "brief", "briefly", "but", "by", "c", "c'mon", "c's", "ca", "call", "came", "can", "can't", "cannot", "cant", "caption", "cause", "causes", "certain", "certainly", "changes", "class=", "clearly", "co", "co.", "com", "come", "comes", "computer", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn't", "couldnt", "course", "cry", "currently", "d", "dare", "daren't", "date", "de", "definitely", "describe", "described", "despite", "detail", "did", "didn't", "different", "directly", "do", "does", "doesn't", "doing", "don't", "done", "down", "downwards", "due", "during", "e", "each", "ed", "edu", "effect", "eg", "eight", "eighty", "either", "eleven", "else", "elsewhere", "empty", "end", "ending", "enough", "entirely", "especially", "et", "et-al", "etc", "even", "ever", "evermore", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "f", "fairly", "far", "farther", "few", "fewer", "ff", "fifteen", "fifth", "fify", "fill", "find", "fire", "first", "five", "fix", "followed", "following", "follows", "for", "forever", "former", "formerly", "forth", "forty", "forward", "found", "four", "from", "front", "full", "further", "furthermore", "g", "gave", "get", "gets", "getting", "give", "given", "gives", "giving", "go", "goes", "going", "gone", "got", "gotten", "greetings", "h", "had", "hadn't", "half", "happens", "hardly", "has", "hasn't", "hasnt", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "hed", "hello", "help", "hence", "her", "here", "here's", "hereafter", "hereby", "herein", "heres", "hereupon", "hers", "herself", "hes", "hi", "hid", "him", "himself", "his", "hither", "home", "hopefully", "how", "how's", "howbeit", "however", "http", "https", "hundred", "i", "i'd", "i'll", "i'm", "i've", "id", "ie", "if", "ignored", "im", "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "inc.", "indeed", "index", "indicate", "indicated", "indicates", "information", "ing", "inner", "inside", "insofar", "instead", "interest", "into", "invention", "inward", "is", "isn't", "it", "it'd", "it'll", "it's", "itd", "its", "itself", "j", "just", "k", "keep", "keeps", "kept", "keys", "kg", "km", "know", "known", "knows", "l", "largely", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "let's", "lets", "like", "liked", "likely", "likewise", "line", "little", "look", "looking", "looks", "low", "lower", "ltd", "m", "made", "mainly", "make", "makes", "many", "may", "maybe", "mayn't", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "mightn't", "mill", "million", "mine", "minus", "miss", "ml", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "much", "mug", "must", "mustn't", "my", "myself", "n", "na", "name", "namely", "nay", "nd", "near", "nearly", "necessarily", "necessary", "need", "needn't", "needs", "neither", "never", "neverf", "neverless", "nevertheless", "new", "next", "nine", "ninety", "no", "no-one", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "notwithstanding", "novel", "now", "nowhere", "o", "obtain", "obtained", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "omitted", "on", "once", "one", "one's", "ones", "only", "onto", "opposite", "or", "ord", "other", "others", "otherwise", "ought", "oughtn't", "our", "ours", "ours", "ourselves", "out", "outside", "over", "overall", "owing", "own", "p", "page", "pages", "part", "particular", "particularly", "past", "per", "perhaps", "placed", "please", "plus", "poorly", "possible", "possibly", "potentially", "pp", "predominantly", "present", "presumably", "previously", "primarily", "probably", "promptly", "proud", "provided", "provides", "put", "q", "que", "quickly", "quite", "qv", "r", "ran", "rather", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "respectively", "resulted", "resulting", "results", "right", "round", "run", "s", "said", "same", "saw", "say", "saying", "says", "sec", "second", "secondly", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall", "shan't", "she", "she'd", "she'll", "she's", "shed", "shes", "should", "shouldn't", "show", "showed", "shown", "showns", "shows", "side", "significant", "significantly", "similar", "similarly", "since", "sincere", "six", "sixty", "slightly", "so", "some", "somebody", "someday", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specifically", "specified", "specify", "specifying", "state", "states", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "system", "t", "t's", "take", "taken", "taking", "tell", "ten", "tends", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "that's", "that've", "thats", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "there'd", "there'll", "there're", "there's", "there've", "thereafter", "thereby", "thered", "therefore", "therein", "thereof", "therere", "theres", "thereto", "thereupon", "these", "they", "they'd", "they'll", "they're", "they've", "theyd", "theyre", "thick", "thin", "thing", "things", "think", "third", "thirty", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "til", "till", "tip", "to", "together", "too", "took", "top", "toward", "towards", "tried", "tries", "truly", "try", "trying", "ts", "twelve", "twenty", "twice", "two", "u", "un", "under", "underneath", "undoing", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "up", "upon", "ups", "upwards", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "uucp", "v", "value", "various", "versus", "very", "via", "viz", "vol", "vols", "vs", "w", "want", "wants", "was", "wasn't", "way", "we", "we'd", "we'll", "we're", "we've", "wed", "welcome", "well", "went", "were", "weren't", "what", "what'll", "what's", "what're", "what've", "whatever", "whats", "when", "when's", "whence", "whenever", "where", "where's", "whereafter", "whereas", "whereby", "wherein", "wheres", "whereupon", "wherever", "whether", "which", "whichever", "while", "whilst", "whim", "whither", "who", "who'd", "who'll", "who's", "whod", "whoever", "whole", "whom", "whomever", "whos", "whose", "why", "why's", "widely", "will", "willing", "wish", "with", "within", "without", "won't", "wonder", "word", "words", "world", "would", "wouldn't", "www", "x", "y", "yes", "yet", "you", "you'd", "you'll", "you're", "you've", "youd", "your", "youre", "yours", "yourself", "yourselves", "z", "zero"]
|
7
7
|
CONTRACTIONS = {
|
8
|
-
"i'm" => "
|
9
|
-
"i'll" => "
|
10
|
-
"i'd" => "
|
11
|
-
"i've" => "
|
8
|
+
"i'm" => "i am",
|
9
|
+
"i'll" => "i will",
|
10
|
+
"i'd" => "i would",
|
11
|
+
"i've" => "i have",
|
12
12
|
"you're" => "you are",
|
13
13
|
"you'll" => "you will",
|
14
14
|
"you'd" => "you would",
|
@@ -20,7 +20,9 @@ module PragmaticTokenizer
|
|
20
20
|
shift_at_symbol(text)
|
21
21
|
convert_dbl_quotes(text)
|
22
22
|
convert_sgl_quotes(text)
|
23
|
-
|
23
|
+
shift_beginning_hyphen(text)
|
24
|
+
shift_ending_hyphen(text)
|
25
|
+
tokens = separate_full_stop(text.squeeze(' ').split.map { |t| convert_sym_to_punct(t) })
|
24
26
|
separate_other_ending_punc(tokens)
|
25
27
|
end
|
26
28
|
|
@@ -58,6 +60,14 @@ module PragmaticTokenizer
|
|
58
60
|
text.gsub!(/¡/, ' ¡ ') || text
|
59
61
|
end
|
60
62
|
|
63
|
+
def shift_ending_hyphen(text)
|
64
|
+
text.gsub!(/-\s+/, ' - ') || text
|
65
|
+
end
|
66
|
+
|
67
|
+
def shift_beginning_hyphen(text)
|
68
|
+
text.gsub!(/\s+-/, ' - ') || text
|
69
|
+
end
|
70
|
+
|
61
71
|
def shift_special_quotes(text)
|
62
72
|
text.gsub!(/«/, ' « ') || text
|
63
73
|
text.gsub!(/»/, ' » ') || text
|
@@ -104,28 +114,32 @@ module PragmaticTokenizer
|
|
104
114
|
end
|
105
115
|
|
106
116
|
def separate_full_stop(tokens)
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
w =~
|
117
|
-
|
118
|
-
|
119
|
-
|
117
|
+
if @language.eql?(Languages::English) || @language.eql?(Languages::Common)
|
118
|
+
abbr = {}
|
119
|
+
@language::ABBREVIATIONS.each do |i|
|
120
|
+
abbr[i] = true
|
121
|
+
end
|
122
|
+
cleaned_tokens = []
|
123
|
+
tokens.each_with_index do |_t, i|
|
124
|
+
if tokens[i + 1] && tokens[i] =~ /\A(.+)\.\z/
|
125
|
+
w = $1
|
126
|
+
unless abbr[w.downcase] || w =~ /\A[a-z]\z/i ||
|
127
|
+
w =~ /[a-z](?:\.[a-z])+\z/i
|
128
|
+
cleaned_tokens << w
|
129
|
+
cleaned_tokens << '.'
|
130
|
+
next
|
131
|
+
end
|
120
132
|
end
|
133
|
+
cleaned_tokens << tokens[i]
|
121
134
|
end
|
122
|
-
cleaned_tokens
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
cleaned_tokens
|
135
|
+
if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/
|
136
|
+
cleaned_tokens[-1] = $1
|
137
|
+
cleaned_tokens.push '.'
|
138
|
+
end
|
139
|
+
cleaned_tokens
|
140
|
+
else
|
141
|
+
tokens.flat_map { |t| t =~ /\.\z/ && !@language::ABBREVIATIONS.include?(Unicode::downcase(t.split(".")[0])) && t.length > 2 ? t.split(".").flatten + ["."] : t }
|
127
142
|
end
|
128
|
-
cleaned_tokens
|
129
143
|
end
|
130
144
|
|
131
145
|
def separate_other_ending_punc(tokens)
|
@@ -1,11 +1,12 @@
|
|
1
1
|
# -*- encoding : utf-8 -*-
|
2
2
|
require 'pragmatic_tokenizer/languages'
|
3
|
+
require 'unicode'
|
3
4
|
|
4
5
|
module PragmaticTokenizer
|
5
6
|
class Tokenizer
|
6
7
|
|
7
|
-
attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length, :remove_roman_numerals
|
8
|
-
def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0, remove_roman_numerals: false)
|
8
|
+
attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length, :remove_roman_numerals, :downcase
|
9
|
+
def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0, remove_roman_numerals: false, downcase: true)
|
9
10
|
unless punctuation.eql?('all') ||
|
10
11
|
punctuation.eql?('semi') ||
|
11
12
|
punctuation.eql?('none') ||
|
@@ -24,7 +25,7 @@ module PragmaticTokenizer
|
|
24
25
|
# Punctuation 'only': Removes everything except punctuation. The
|
25
26
|
# returned result is an array of only the punctuation.
|
26
27
|
end
|
27
|
-
@text = text
|
28
|
+
@text = CGI.unescapeHTML(text)
|
28
29
|
@language = language
|
29
30
|
@language_module = Languages.get_language_by_code(language)
|
30
31
|
@punctuation = punctuation
|
@@ -34,11 +35,37 @@ module PragmaticTokenizer
|
|
34
35
|
@remove_numbers = remove_numbers
|
35
36
|
@minimum_length = minimum_length
|
36
37
|
@remove_roman_numerals = remove_roman_numerals
|
38
|
+
@downcase = downcase
|
37
39
|
end
|
38
40
|
|
39
41
|
def tokenize
|
40
42
|
return [] unless text
|
41
|
-
cleaner(remove_short_tokens(delete_numbers(delete_roman_numerals(find_contractions(delete_stop_words(remove_punctuation(processor.new(language: language_module).process(text: text)))))))).reject { |t| t.empty? }
|
43
|
+
downcase_tokens(cleaner(remove_short_tokens(delete_numbers(delete_roman_numerals(find_contractions(delete_stop_words(remove_punctuation(processor.new(language: language_module).process(text: text))))))))).reject { |t| t.empty? }
|
44
|
+
end
|
45
|
+
|
46
|
+
def urls
|
47
|
+
[]
|
48
|
+
end
|
49
|
+
|
50
|
+
def emails
|
51
|
+
[]
|
52
|
+
end
|
53
|
+
|
54
|
+
def hashtags
|
55
|
+
[]
|
56
|
+
end
|
57
|
+
|
58
|
+
def mentions
|
59
|
+
text.split(' ').delete_if { |t| t !~ /\A(@|@)/ }
|
60
|
+
end
|
61
|
+
|
62
|
+
def emoticons
|
63
|
+
text.scan(/(?::|;|=)(?:-)?(?:\)|D|P)/)
|
64
|
+
end
|
65
|
+
|
66
|
+
def emoji
|
67
|
+
# https://github.com/franklsf95/ruby-emoji-regex
|
68
|
+
text.scan(/[\u{203C}\u{2049}\u{20E3}\u{2122}\u{2139}\u{2194}-\u{2199}\u{21A9}-\u{21AA}\u{231A}-\u{231B}\u{23E9}-\u{23EC}\u{23F0}\u{23F3}\u{24C2}\u{25AA}-\u{25AB}\u{25B6}\u{25C0}\u{25FB}-\u{25FE}\u{2600}-\u{2601}\u{260E}\u{2611}\u{2614}-\u{2615}\u{261D}\u{263A}\u{2648}-\u{2653}\u{2660}\u{2663}\u{2665}-\u{2666}\u{2668}\u{267B}\u{267F}\u{2693}\u{26A0}-\u{26A1}\u{26AA}-\u{26AB}\u{26BD}-\u{26BE}\u{26C4}-\u{26C5}\u{26CE}\u{26D4}\u{26EA}\u{26F2}-\u{26F3}\u{26F5}\u{26FA}\u{26FD}\u{2702}\u{2705}\u{2708}-\u{270C}\u{270F}\u{2712}\u{2714}\u{2716}\u{2728}\u{2733}-\u{2734}\u{2744}\u{2747}\u{274C}\u{274E}\u{2753}-\u{2755}\u{2757}\u{2764}\u{2795}-\u{2797}\u{27A1}\u{27B0}\u{2934}-\u{2935}\u{2B05}-\u{2B07}\u{2B1B}-\u{2B1C}\u{2B50}\u{2B55}\u{3030}\u{303D}\u{3297}\u{3299}\u{1F004}\u{1F0CF}\u{1F170}-\u{1F171}\u{1F17E}-\u{1F17F}\u{1F18E}\u{1F191}-\u{1F19A}\u{1F1E7}-\u{1F1EC}\u{1F1EE}-\u{1F1F0}\u{1F1F3}\u{1F1F5}\u{1F1F7}-\u{1F1FA}\u{1F201}-\u{1F202}\u{1F21A}\u{1F22F}\u{1F232}-\u{1F23A}\u{1F250}-\u{1F251}\u{1F300}-\u{1F320}\u{1F330}-\u{1F335}\u{1F337}-\u{1F37C}\u{1F380}-\u{1F393}\u{1F3A0}-\u{1F3C4}\u{1F3C6}-\u{1F3CA}\u{1F3E0}-\u{1F3F0}\u{1F400}-\u{1F43E}\u{1F440}\u{1F442}-\u{1F4F7}\u{1F4F9}-\u{1F4FC}\u{1F500}-\u{1F507}\u{1F509}-\u{1F53D}\u{1F550}-\u{1F567}\u{1F5FB}-\u{1F640}\u{1F645}-\u{1F64F}\u{1F680}-\u{1F68A}]/)
|
42
69
|
end
|
43
70
|
|
44
71
|
private
|
@@ -49,6 +76,15 @@ module PragmaticTokenizer
|
|
49
76
|
Processor
|
50
77
|
end
|
51
78
|
|
79
|
+
def downcase_tokens(tokens)
|
80
|
+
return tokens unless downcase
|
81
|
+
if language.eql?('en')
|
82
|
+
tokens.map { |t| t.downcase }
|
83
|
+
else
|
84
|
+
tokens.map { |t| Unicode::downcase(t) }
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
52
88
|
def remove_short_tokens(tokens)
|
53
89
|
tokens.delete_if { |t| t.length < minimum_length }
|
54
90
|
end
|
@@ -60,7 +96,7 @@ module PragmaticTokenizer
|
|
60
96
|
|
61
97
|
def delete_roman_numerals(tokens)
|
62
98
|
return tokens unless remove_roman_numerals
|
63
|
-
tokens.delete_if { |t| PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(t) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{t}.") } if remove_roman_numerals
|
99
|
+
tokens.delete_if { |t| PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(t.downcase) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{t.downcase}.") } if remove_roman_numerals
|
64
100
|
end
|
65
101
|
|
66
102
|
def cleaner(tokens)
|
@@ -87,21 +123,31 @@ module PragmaticTokenizer
|
|
87
123
|
end
|
88
124
|
|
89
125
|
def only_punctuation(tokens)
|
90
|
-
tokens.delete_if
|
91
|
-
t.squeeze!
|
92
|
-
true unless PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t)
|
93
|
-
end
|
126
|
+
tokens.delete_if { |t| !PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) }
|
94
127
|
end
|
95
128
|
|
96
129
|
def delete_stop_words(tokens)
|
97
130
|
return tokens unless remove_stop_words && language_module::STOP_WORDS
|
98
|
-
|
131
|
+
if downcase
|
132
|
+
if language.eql?('en')
|
133
|
+
tokens.map { |t| t.downcase } - language_module::STOP_WORDS
|
134
|
+
else
|
135
|
+
tokens.map { |t| Unicode::downcase(t) } - language_module::STOP_WORDS
|
136
|
+
end
|
137
|
+
else
|
138
|
+
tokens.delete_if { |t| language_module::STOP_WORDS.include?(t.downcase) }
|
139
|
+
end
|
99
140
|
end
|
100
141
|
|
101
142
|
def find_contractions(tokens)
|
102
143
|
return tokens unless expand_contractions && language_module::CONTRACTIONS
|
103
|
-
|
104
|
-
.flat_map { |t|
|
144
|
+
if downcase
|
145
|
+
tokens.flat_map { |t| language_module::CONTRACTIONS.has_key?(t.downcase) ? language_module::CONTRACTIONS[t.downcase].split(' ').flatten : t }
|
146
|
+
.flat_map { |t| t.include?("/") ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
|
147
|
+
else
|
148
|
+
tokens.flat_map { |t| language_module::CONTRACTIONS.has_key?(t.downcase) ? language_module::CONTRACTIONS[t.downcase].split(' ').each_with_index.map { |t, i| i.eql?(0) ? Unicode::capitalize(t) : t }.flatten : t }
|
149
|
+
.flat_map { |t| t.include?("/") ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
|
150
|
+
end
|
105
151
|
end
|
106
152
|
end
|
107
153
|
end
|
data/pragmatic_tokenizer.gemspec
CHANGED
@@ -18,6 +18,7 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
19
19
|
spec.require_paths = ["lib"]
|
20
20
|
|
21
|
+
spec.add_runtime_dependency "unicode"
|
21
22
|
spec.add_development_dependency "bundler", "~> 1.9"
|
22
23
|
spec.add_development_dependency "rake", "~> 10.0"
|
23
24
|
spec.add_development_dependency "rspec"
|
metadata
CHANGED
@@ -1,15 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-01-
|
11
|
+
date: 2016-01-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: unicode
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
13
27
|
- !ruby/object:Gem::Dependency
|
14
28
|
name: bundler
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|