stopwords-filter 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +7 -0
- data/Gemfile.lock +29 -0
- data/LICENSE.txt +20 -0
- data/README.md +113 -0
- data/VERSION +1 -0
- data/lib/stopwords.rb +4 -0
- data/lib/stopwords/filter.rb +13 -0
- data/lib/stopwords/snowball.rb +3 -0
- data/lib/stopwords/snowball/filter.rb +19 -0
- data/lib/stopwords/snowball/locales/da.csv +1 -0
- data/lib/stopwords/snowball/locales/de.csv +1 -0
- data/lib/stopwords/snowball/locales/en.csv +1 -0
- data/lib/stopwords/snowball/locales/es.csv +1 -0
- data/lib/stopwords/snowball/locales/fn.csv +1 -0
- data/lib/stopwords/snowball/locales/fr.csv +1 -0
- data/lib/stopwords/snowball/locales/hu.csv +1 -0
- data/lib/stopwords/snowball/locales/it.csv +1 -0
- data/lib/stopwords/snowball/locales/nl.csv +1 -0
- data/lib/stopwords/snowball/locales/pt.csv +1 -0
- data/lib/stopwords/snowball/locales/ru.csv +159 -0
- data/lib/stopwords/snowball/locales/sv.csv +114 -0
- data/spec/lib/filter_spec.rb +17 -0
- data/spec/lib/snowball_filter_spec.rb +19 -0
- data/spec/spec_helper.rb +1 -0
- metadata +78 -0
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
GEM
|
2
|
+
remote: https://rubygems.org/
|
3
|
+
specs:
|
4
|
+
diff-lcs (1.1.3)
|
5
|
+
git (1.2.5)
|
6
|
+
jeweler (1.8.4)
|
7
|
+
bundler (~> 1.0)
|
8
|
+
git (>= 1.2.5)
|
9
|
+
rake
|
10
|
+
rdoc
|
11
|
+
json (1.7.4)
|
12
|
+
rake (0.9.2.2)
|
13
|
+
rdoc (3.12)
|
14
|
+
json (~> 1.4)
|
15
|
+
rspec (2.11.0)
|
16
|
+
rspec-core (~> 2.11.0)
|
17
|
+
rspec-expectations (~> 2.11.0)
|
18
|
+
rspec-mocks (~> 2.11.0)
|
19
|
+
rspec-core (2.11.1)
|
20
|
+
rspec-expectations (2.11.2)
|
21
|
+
diff-lcs (~> 1.1.3)
|
22
|
+
rspec-mocks (2.11.2)
|
23
|
+
|
24
|
+
PLATFORMS
|
25
|
+
ruby
|
26
|
+
|
27
|
+
DEPENDENCIES
|
28
|
+
jeweler (= 1.8.4)
|
29
|
+
rspec (= 2.11)
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2012 David J. Brenes
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,113 @@
|
|
1
|
+
Stopwords Filter
|
2
|
+
================
|
3
|
+
|
4
|
+
This project is a very simple and naive implementation of a stopwords filter that removes a list of banned words (stopwords) from a sentence.
|
5
|
+
|
6
|
+
Quick guide
|
7
|
+
-----------
|
8
|
+
|
9
|
+
* Install
|
10
|
+
|
11
|
+
just type
|
12
|
+
|
13
|
+
```
|
14
|
+
gem install stopwords-filter
|
15
|
+
```
|
16
|
+
|
17
|
+
or
|
18
|
+
|
19
|
+
```
|
20
|
+
gem 'stopwords-filter'
|
21
|
+
```
|
22
|
+
|
23
|
+
in your Gemfile.
|
24
|
+
|
25
|
+
* Use it
|
26
|
+
|
27
|
+
1. Simple version
|
28
|
+
|
29
|
+
```
|
30
|
+
stopwords = ['by', 'written', 'from']
|
31
|
+
filter = Stopwords::Filter.new stopwords
|
32
|
+
filter.filter 'guide by douglas adams'.split #-> ['guide', 'douglas', 'adams']
|
33
|
+
```
|
34
|
+
|
35
|
+
2. Snowball version
|
36
|
+
|
37
|
+
|
38
|
+
```
|
39
|
+
filter = Stopwords::Snowball::Filter.new "en"
|
40
|
+
filter.filter 'guide by douglas adams'.split #-> ['guide', 'douglas', 'adams']
|
41
|
+
```
|
42
|
+
|
43
|
+
|
44
|
+
|
45
|
+
What is a Stopword?
|
46
|
+
-------------------
|
47
|
+
|
48
|
+
According to [Wikipedia][wikipedia_stopwords]
|
49
|
+
|
50
|
+
> In computing, stop words are words which are filtered out prior to, or after, processing of natural language data (text).
|
51
|
+
|
52
|
+
And that's it. Words that are removed before you perform some task on the rest of them.
|
53
|
+
|
54
|
+
Why would I want to remove anything?
|
55
|
+
------------------------------------
|
56
|
+
|
57
|
+
Imagine you have a database of products and you want your customers to be able to search them. You can't use a proper search engine (such as [Solr][solr], [Sphinx][sphinx] or even [Google][google]), nor the full-text search features of popular database systems such as [PostgreSQL][postgre]. You are left alone with LIKEs and %.
|
58
|
+
|
59
|
+
You have your fake search engine working. Someone searches 'Guide Douglas Adams' and you find 'Douglas Adams - Hitchhiker's guide to the galaxy'. Everything is perfect.
|
60
|
+
|
61
|
+
But then someone searches 'guide by douglas adams' and you don't find anything. You don't have any 'by' in the description or title of the book! Most importantly, you don't need that 'by'!
|
62
|
+
|
63
|
+
You wish you could get rid of all those 'by' or 'written' or 'from', huh? That's why we are here!
|
64
|
+
|
65
|
+
How does this thing work?
|
66
|
+
---------------------
|
67
|
+
|
68
|
+
The main class of this 'library' is Stopwords::Filter. You just create a new object with an array of stopwords:
|
69
|
+
|
70
|
+
```
|
71
|
+
stopwords = ['by', 'written', 'from']
|
72
|
+
filter = Stopwords::Filter.new stopwords
|
73
|
+
```
|
74
|
+
|
75
|
+
And there you have it: now you can just filter
|
76
|
+
|
77
|
+
```
|
78
|
+
filter.filter 'guide by douglas adams'.split #-> ['guide', 'douglas', 'adams']
|
79
|
+
```
|
80
|
+
|
81
|
+
That's all?
|
82
|
+
-----------
|
83
|
+
|
84
|
+
I know what you're thinking: it takes a single line of Ruby code to filter one array against another. That's why we have added some extra functionality: [Snowball][wikipedia_snowball] stopwords lists, already built for you and ready to use.
|
85
|
+
|
86
|
+
How do I use that snowball thing?
|
87
|
+
---------------------------------
|
88
|
+
|
89
|
+
You just create the filter with the locale you want to use
|
90
|
+
|
91
|
+
```
|
92
|
+
filter = Stopwords::Snowball::Filter.new "en"
|
93
|
+
```
|
94
|
+
|
95
|
+
And then you filter without worrying about the exact stopwords used
|
96
|
+
|
97
|
+
```
|
98
|
+
filter.filter 'guide by douglas adams'.split #-> ['guide', 'douglas', 'adams']
|
99
|
+
```
|
100
|
+
|
101
|
+
Anything else?
|
102
|
+
--------------
|
103
|
+
|
104
|
+
In a future version I would like to include a chaining filter where you include a series of operations and they are executed in linear order, just like the [Pipes and Filters design pattern][wikipedia_pipes_filters].
|
105
|
+
|
106
|
+
|
107
|
+
[wikipedia_stopwords]: http://en.wikipedia.org/wiki/Stopword
|
108
|
+
[solr]: https://github.com/sunspot/sunspot
|
109
|
+
[sphinx]: https://github.com/freelancing-god/thinking-sphinx
|
110
|
+
[google]: https://github.com/alexreisner/google_custom_search
|
111
|
+
[postgre]: https://github.com/Casecommons/pg_search
|
112
|
+
[wikipedia_snowball]: http://en.wikipedia.org/wiki/Snowball_programming_language
|
113
|
+
[wikipedia_pipes_filters]: http://en.wikipedia.org/wiki/Pipes_and_filters
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
data/lib/stopwords.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# Stopwords filter preloaded with one of the bundled Snowball word lists.
#
# The list is read from "locales/<locale>.csv" next to this file and passed to
# Stopwords::Filter as an array of words.
class Stopwords::Snowball::Filter < Stopwords::Filter

  # Locale code that was given to the constructor (for example "en" or "es").
  attr_reader :locale
  # Path of the word-list file resolved for +locale+.
  attr_reader :locale_filename

  # Builds a filter from the bundled word list for +locale+.
  #
  # locale - name of a file (without the ".csv" extension) in the "locales"
  #          directory that sits next to this source file.
  #
  # Raises RuntimeError ("Unknown locale") when no such file exists.
  def initialize locale
    @locale = locale
    @locale_filename = "#{File.dirname(__FILE__)}/locales/#{locale}.csv"

    # File.exists? was deprecated and later removed from Ruby; File.exist? is
    # the supported spelling and has always been available.
    raise "Unknown locale" unless File.exist?(@locale_filename)

    # Some bundled lists are comma-separated on a single line, others hold one
    # word per line, so split on both and drop surrounding whitespace and any
    # empty entries (trailing comma, trailing newline).
    words = File.read(@locale_filename).split(/[,\r\n]+/).map(&:strip).reject(&:empty?)

    super words
  end

end
|
@@ -0,0 +1 @@
|
|
1
|
+
og,i,jeg,det,at,en,den,til,er,som,på,de,med,han,af,for,ikke,der,var,mig,sig,men,et,har,om,vi,min,havde,ham,hun,nu,over,da,fra,du,ud,sin,dem,os,op,man,hans,hvor,eller,hvad,skal,selv,her,alle,vil,blev,kunne,ind,når,være,dog,noget,ville,jo,deres,efter,ned,skulle,denne,end,dette,mit,også,under,have,dig,anden,hende,mine,alt,meget,sit,sine,vor,mod,disse,hvis,din,nogle,hos,blive,mange,ad,bliver,hendes,været,thi,jer,sådan
|
@@ -0,0 +1 @@
|
|
1
|
+
aber,alle,allem,allen,aller,alles,als,also,am,an,ander,andere,anderem,anderen,anderer,anderes,anderm,andern,anderr,anders,auch,auf,aus,bei,bin,bis,bist,da,damit,dann,der,den,des,dem,die,das,daß,derselbe,derselben,denselben,desselben,demselben,dieselbe,dieselben,dasselbe,dazu,dein,deine,deinem,deinen,deiner,deines,denn,derer,dessen,dich,dir,du,dies,diese,diesem,diesen,dieser,dieses,doch,dort,durch,ein,eine,einem,einen,einer,eines,einig,einige,einigem,einigen,einiger,einiges,einmal,er,ihn,ihm,es,etwas,euer,eure,eurem,euren,eurer,eures,für,gegen,gewesen,hab,habe,haben,hat,hatte,hatten,hier,hin,hinter,ich,mich,mir,ihr,ihre,ihrem,ihren,ihrer,ihres,euch,im,in,indem,ins,ist,jede,jedem,jeden,jeder,jedes,jene,jenem,jenen,jener,jenes,jetzt,kann,kein,keine,keinem,keinen,keiner,keines,können,könnte,machen,man,manche,manchem,manchen,mancher,manches,mein,meine,meinem,meinen,meiner,meines,mit,muss,musste,nach,nicht,nichts,noch,nun,nur,ob,oder,ohne,sehr,sein,seine,seinem,seinen,seiner,seines,selbst,sich,sie,ihnen,sind,so,solche,solchem,solchen,solcher,solches,soll,sollte,sondern,sonst,über,um,und,uns,unse,unsem,unsen,unser,unses,unter,viel,vom,von,vor,während,war,waren,warst,was,weg,weil,weiter,welche,welchem,welchen,welcher,welches,wenn,werde,werden,wie,wieder,will,wir,wird,wirst,wo,wollen,wollte,würde,würden,zu,zum,zur,zwar,zwischen
|
@@ -0,0 +1 @@
|
|
1
|
+
i,me,my,myself,we,our,ours,ourselves,you,your,yours,yourself,yourselves,he,him,his,himself,she,her,hers,herself,it,its,itself,they,them,their,theirs,themselves,what,which,who,whom,this,that,these,those,am,is,are,was,were,be,been,being,have,has,had,having,do,does,did,doing,would,should,could,ought,i'm,you're,he's,she's,it's,we're,they're,i've,you've,we've,they've,i'd,you'd,he'd,she'd,we'd,they'd,i'll,you'll,he'll,she'll,we'll,they'll,isn't,aren't,wasn't,weren't,hasn't,haven't,hadn't,doesn't,don't,didn't,won't,wouldn't,shan't,shouldn't,can't,cannot,couldn't,mustn't,let's,that's,who's,what's,here's,there's,when's,where's,why's,how's,a,an,the,and,but,if,or,because,as,until,while,of,at,by,for,with,about,against,between,into,through,during,before,after,above,below,to,from,up,down,in,out,on,off,over,under,again,further,then,once,here,there,when,where,why,how,all,any,both,each,few,more,most,other,some,such,no,nor,not,only,own,same,so,than,too,very
|
@@ -0,0 +1 @@
|
|
1
|
+
de,la,que,el,en,y,a,los,del,se,las,por,un,para,con,no,una,su,al,lo,como,más,pero,sus,le,ya,o,este,sí,porque,esta,entre,cuando,muy,sin,sobre,también,me,hasta,hay,donde,quien,desde,todo,nos,durante,todos,uno,les,ni,contra,otros,ese,eso,ante,ellos,e,esto,mí,antes,algunos,qué,unos,yo,otro,otras,otra,él,tanto,esa,estos,mucho,quienes,nada,muchos,cual,poco,ella,estar,estas,algunas,algo,nosotros,mi,mis,tú,te,ti,tu,tus,ellas,nosotras,vosotros,vosotras,os,mío,mía,míos,mías,tuyo,tuya,tuyos,tuyas,suyo,suya,suyos,suyas,nuestro,nuestra,nuestros,nuestras,vuestro,vuestra,vuestros,vuestras,esos,esas,estoy,estás,está,estamos,estáis,están,esté,estés,estemos,estéis,estén,estaré,estarás,estará,estaremos,estaréis,estarán,estaría,estarías,estaríamos,estaríais,estarían,estaba,estabas,estábamos,estabais,estaban,estuve,estuviste,estuvo,estuvimos,estuvisteis,estuvieron,estuviera,estuvieras,estuviéramos,estuvierais,estuvieran,estuviese,estuvieses,estuviésemos,estuvieseis,estuviesen,estando,estado,estada,estados,estadas,estad,he,has,ha,hemos,habéis,han,haya,hayas,hayamos,hayáis,hayan,habré,habrás,habrá,habremos,habréis,habrán,habría,habrías,habríamos,habríais,habrían,había,habías,habíamos,habíais,habían,hube,hubiste,hubo,hubimos,hubisteis,hubieron,hubiera,hubieras,hubiéramos,hubierais,hubieran,hubiese,hubieses,hubiésemos,hubieseis,hubiesen,habiendo,habido,habida,habidos,habidas,soy,eres,es,somos,sois,son,sea,seas,seamos,seáis,sean,seré,serás,será,seremos,seréis,serán,sería,serías,seríamos,seríais,serían,era,eras,éramos,erais,eran,fui,fuiste,fue,fuimos,fuisteis,fueron,fuera,fueras,fuéramos,fuerais,fueran,fuese,fueses,fuésemos,fueseis,fuesen,siendo,sido,tengo,tienes,tiene,tenemos,tenéis,tienen,tenga,tengas,tengamos,tengáis,tengan,tendré,tendrás,tendrá,tendremos,tendréis,tendrán,tendría,tendrías,tendríamos,tendríais,tendrían,tenía,tenías,teníamos,teníais,tenían,tuve,tuviste,tuvo,tuvimos,tuvisteis,tuvieron,tuviera,tuvieras,tuviéramos,tuvierais,tuvieran,tuviese,tuvieses,tuviésemos,tuvieseis,tuviesen
,teniendo,tenido,tenida,tenidos,tenidas,tened
|
@@ -0,0 +1 @@
|
|
1
|
+
olla,olen,olet,on,olemme,olette,ovat,ole,oli,olisi,olisit,olisin,olisimme,olisitte,olisivat,olit,olin,olimme,olitte,olivat,ollut,olleet,en,et,ei,emme,ette,eivät,minä,minun,minut,minua,minussa,minusta,minuun,minulla,minulta,minulle,sinä,sinun,sinut,sinua,sinussa,sinusta,sinuun,sinulla,sinulta,sinulle,hän,hänen,hänet,häntä,hänessä,hänestä,häneen,hänellä,häneltä,hänelle,me,meidän,meidät,meitä,meissä,meistä,meihin,meillä,meiltä,meille,te,teidän,teidät,teitä,teissä,teistä,teihin,teillä,teiltä,teille,he,heidän,heidät,heitä,heissä,heistä,heihin,heillä,heiltä,heille,tämä,tämän,tätä,tässä,tästä,tähän,tällä,tältä,tälle,tänä,täksi,tuo,tuon,tuota,tuossa,tuosta,tuohon,tuolla,tuolta,tuolle,tuona,tuoksi,se,sen,sitä,siinä,siitä,siihen,sillä,siltä,sille,sinä,siksi,nämä,näiden,näitä,näissä,näistä,näihin,näillä,näiltä,näille,näinä,näiksi,nuo,noiden,noita,noissa,noista,noihin,noilla,noilta,noille,noina,noiksi,ne,niiden,niitä,niissä,niistä,niihin,niillä,niiltä,niille,niinä,niiksi,kuka,kenen,kenet,ketä,kenessä,kenestä,keneen,kenellä,keneltä,kenelle,kenenä,keneksi,ketkä,keiden,ketkä,keitä,keissä,keistä,keihin,keillä,keiltä,keille,keinä,keiksi,mikä,minkä,minkä,mitä,missä,mistä,mihin,millä,miltä,mille,minä,miksi,mitkä,joka,jonka,jota,jossa,josta,johon,jolla,jolta,jolle,jona,joksi,jotka,joiden,joita,joissa,joista,joihin,joilla,joilta,joille,joina,joiksi,että,ja,jos,koska,kuin,mutta,niin,sekä,sillä,tai,vaan,vai,vaikka,kanssa,mukaan,noin,poikki,yli,kun,niin,nyt,itse
|
@@ -0,0 +1 @@
|
|
1
|
+
au,aux,avec,ce,ces,dans,de,des,du,elle,en,et,eux,il,je,la,le,leur,lui,ma,mais,me,même,mes,moi,mon,ne,nos,notre,nous,on,ou,par,pas,pour,qu,que,qui,sa,se,ses,son,sur,ta,te,tes,toi,ton,tu,un,une,vos,votre,vous,c,d,j,l,à,m,n,s,t,y,été,étée,étées,étés,étant,suis,es,est,sommes,êtes,sont,serai,seras,sera,serons,serez,seront,serais,serait,serions,seriez,seraient,étais,était,étions,étiez,étaient,fus,fut,fûmes,fûtes,furent,sois,soit,soyons,soyez,soient,fusse,fusses,fût,fussions,fussiez,fussent,ayant,eu,eue,eues,eus,ai,as,avons,avez,ont,aurai,auras,aura,aurons,aurez,auront,aurais,aurait,aurions,auriez,auraient,avais,avait,avions,aviez,avaient,eut,eûmes,eûtes,eurent,aie,aies,ait,ayons,ayez,aient,eusse,eusses,eût,eussions,eussiez,eussent,ceci,celà,cet,cette,ici,ils,les,leurs,quel,quels,quelle,quelles,sans,soi
|
@@ -0,0 +1 @@
|
|
1
|
+
a,ahogy,ahol,aki,akik,akkor,alatt,által,általában,amely,amelyek,amelyekben,amelyeket,amelyet,amelynek,ami,amit,amolyan,amíg,amikor,át,abban,ahhoz,annak,arra,arról,az,azok,azon,azt,azzal,azért,aztán,azután,azonban,bár,be,belül,benne,cikk,cikkek,cikkeket,csak,de,e,eddig,egész,egy,egyes,egyetlen,egyéb,egyik,egyre,ekkor,el,elég,ellen,elõ,elõször,elõtt,elsõ,én,éppen,ebben,ehhez,emilyen,ennek,erre,ez,ezt,ezek,ezen,ezzel,ezért,és,fel,felé,hanem,hiszen,hogy,hogyan,igen,így,illetve,ill.,ill,ilyen,ilyenkor,ison,ismét,itt,jó,jól,jobban,kell,kellett,keresztül,keressünk,ki,kívül,között,közül,legalább,lehet,lehetett,legyen,lenne,lenni,lesz,lett,maga,magát,majd,majd,már,más,másik,meg,még,mellett,mert,mely,melyek,mi,mit,míg,miért,milyen,mikor,minden,mindent,mindenki,mindig,mint,mintha,mivel,most,nagy,nagyobb,nagyon,ne,néha,nekem,neki,nem,néhány,nélkül,nincs,olyan,ott,össze,õ,õk,õket,pedig,persze,rá,s,saját,sem,semmi,sok,sokat,sokkal,számára,szemben,szerint,szinte,talán,tehát,teljes,tovább,továbbá,több,úgy,ugyanis,új,újabb,újra,után,utána,utolsó,vagy,vagyis,valaki,valami,valamint,való,vagyok,van,vannak,volt,voltam,voltak,voltunk,vissza,vele,viszont,volna
|
@@ -0,0 +1 @@
|
|
1
|
+
ad,al,allo,ai,agli,all,agl,alla,alle,con,col,coi,da,dal,dallo,dai,dagli,dall,dagl,dalla,dalle,di,del,dello,dei,degli,dell,degl,della,delle,in,nel,nello,nei,negli,nell,negl,nella,nelle,su,sul,sullo,sui,sugli,sull,sugl,sulla,sulle,per,tra,contro,io,tu,lui,lei,noi,voi,loro,mio,mia,miei,mie,tuo,tua,tuoi,tue,suo,sua,suoi,sue,nostro,nostra,nostri,nostre,vostro,vostra,vostri,vostre,mi,ti,ci,vi,lo,la,li,le,gli,ne,il,un,uno,una,ma,ed,se,perché,anche,come,dov,dove,che,chi,cui,non,più,quale,quanto,quanti,quanta,quante,quello,quelli,quella,quelle,questo,questi,questa,queste,si,tutto,tutti,a,c,e,i,l,o,ho,hai,ha,abbiamo,avete,hanno,abbia,abbiate,abbiano,avrò,avrai,avrà,avremo,avrete,avranno,avrei,avresti,avrebbe,avremmo,avreste,avrebbero,avevo,avevi,aveva,avevamo,avevate,avevano,ebbi,avesti,ebbe,avemmo,aveste,ebbero,avessi,avesse,avessimo,avessero,avendo,avuto,avuta,avuti,avute,sono,sei,è,siamo,siete,sia,siate,siano,sarò,sarai,sarà,saremo,sarete,saranno,sarei,saresti,sarebbe,saremmo,sareste,sarebbero,ero,eri,era,eravamo,eravate,erano,fui,fosti,fu,fummo,foste,furono,fossi,fosse,fossimo,fossero,essendo,faccio,fai,facciamo,fanno,faccia,facciate,facciano,farò,farai,farà,faremo,farete,faranno,farei,faresti,farebbe,faremmo,fareste,farebbero,facevo,facevi,faceva,facevamo,facevate,facevano,feci,facesti,fece,facemmo,faceste,fecero,facessi,facesse,facessimo,facessero,facendo,sto,stai,sta,stiamo,stanno,stia,stiate,stiano,starò,starai,starà,staremo,starete,staranno,starei,staresti,starebbe,staremmo,stareste,starebbero,stavo,stavi,stava,stavamo,stavate,stavano,stetti,stesti,stette,stemmo,steste,stettero,stessi,stesse,stessimo,stessero,stando
|
@@ -0,0 +1 @@
|
|
1
|
+
de,en,van,ik,te,dat,die,in,een,hij,het,niet,zijn,is,was,op,aan,met,als,voor,had,er,maar,om,hem,dan,zou,of,wat,mijn,men,dit,zo,door,over,ze,zich,bij,ook,tot,je,mij,uit,der,daar,haar,naar,heb,hoe,heeft,hebben,deze,u,want,nog,zal,me,zij,nu,ge,geen,omdat,iets,worden,toch,al,waren,veel,meer,doen,toen,moet,ben,zonder,kan,hun,dus,alles,onder,ja,eens,hier,wie,werd,altijd,doch,wordt,wezen,kunnen,ons,zelf,tegen,na,reeds,wil,kon,niets,uw,iemand,geweest,andere,
|
@@ -0,0 +1 @@
|
|
1
|
+
de,a,o,que,e,do,da,em,um,para,com,não,uma,os,no,se,na,por,mais,as,dos,como,mas,ao,ele,das,à,seu,sua,ou,quando,muito,nos,já,eu,também,só,pelo,pela,até,isso,ela,entre,depois,sem,mesmo,aos,seus,quem,nas,me,esse,eles,você,essa,num,nem,suas,meu,às,minha,numa,pelos,elas,qual,nós,lhe,deles,essas,esses,pelas,este,dele,tu,te,vocês,vos,lhes,meus,minhas,teu,tua,teus,tuas,nosso,nossa,nossos,nossas,dela,delas,esta,estes,estas,aquele,aquela,aqueles,aquelas,isto,aquilo,estou,está,estamos,estão,estive,esteve,estivemos,estiveram,estava,estávamos,estavam,estivera,estivéramos,esteja,estejamos,estejam,estivesse,estivéssemos,estivessem,estiver,estivermos,estiverem,hei,há,havemos,hão,houve,houvemos,houveram,houvera,houvéramos,haja,hajamos,hajam,houvesse,houvéssemos,houvessem,houver,houvermos,houverem,houverei,houverá,houveremos,houverão,houveria,houveríamos,houveriam,sou,somos,são,era,éramos,eram,fui,foi,fomos,foram,fora,fôramos,seja,sejamos,sejam,fosse,fôssemos,fossem,for,formos,forem,serei,será,seremos,serão,seria,seríamos,seriam,tenho,tem,temos,tém,tinha,tínhamos,tinham,tive,teve,tivemos,tiveram,tivera,tivéramos,tenha,tenhamos,tenham,tivesse,tivéssemos,tivessem,tiver,tivermos,tiverem,terei,terá,teremos,terão,teria,teríamos,teriam
|
@@ -0,0 +1,159 @@
|
|
1
|
+
É
|
2
|
+
×
|
3
|
+
×Ï
|
4
|
+
ÎÅ
|
5
|
+
ÞÔÏ
|
6
|
+
ÏÎ
|
7
|
+
ÎÁ
|
8
|
+
Ñ
|
9
|
+
Ó
|
10
|
+
ÓÏ
|
11
|
+
ËÁË
|
12
|
+
Á
|
13
|
+
ÔÏ
|
14
|
+
×ÓÅ
|
15
|
+
ÏÎÁ
|
16
|
+
ÔÁË
|
17
|
+
ÅÇÏ
|
18
|
+
ÎÏ
|
19
|
+
ÄÁ
|
20
|
+
ÔÙ
|
21
|
+
Ë
|
22
|
+
Õ
|
23
|
+
ÖÅ
|
24
|
+
×Ù
|
25
|
+
ÚÁ
|
26
|
+
ÂÙ
|
27
|
+
ÐÏ
|
28
|
+
ÔÏÌØËÏ
|
29
|
+
ÅÅ
|
30
|
+
ÍÎÅ
|
31
|
+
ÂÙÌÏ
|
32
|
+
×ÏÔ
|
33
|
+
ÏÔ
|
34
|
+
ÍÅÎÑ
|
35
|
+
ÅÝÅ
|
36
|
+
ÎÅÔ
|
37
|
+
Ï
|
38
|
+
ÉÚ
|
39
|
+
ÅÍÕ
|
40
|
+
ÔÅÐÅÒØ
|
41
|
+
ËÏÇÄÁ
|
42
|
+
ÄÁÖÅ
|
43
|
+
ÎÕ
|
44
|
+
×ÄÒÕÇ
|
45
|
+
ÌÉ
|
46
|
+
ÅÓÌÉ
|
47
|
+
ÕÖÅ
|
48
|
+
ÉÌÉ
|
49
|
+
ÎÉ
|
50
|
+
ÂÙÔØ
|
51
|
+
ÂÙÌ
|
52
|
+
ÎÅÇÏ
|
53
|
+
ÄÏ
|
54
|
+
×ÁÓ
|
55
|
+
ÎÉÂÕÄØ
|
56
|
+
ÏÐÑÔØ
|
57
|
+
ÕÖ
|
58
|
+
×ÁÍ
|
59
|
+
ÓËÁÚÁÌ
|
60
|
+
×ÅÄØ
|
61
|
+
ÔÁÍ
|
62
|
+
ÐÏÔÏÍ
|
63
|
+
ÓÅÂÑ
|
64
|
+
ÎÉÞÅÇÏ
|
65
|
+
ÅÊ
|
66
|
+
ÍÏÖÅÔ
|
67
|
+
ÏÎÉ
|
68
|
+
ÔÕÔ
|
69
|
+
ÇÄÅ
|
70
|
+
ÅÓÔØ
|
71
|
+
ÎÁÄÏ
|
72
|
+
ÎÅÊ
|
73
|
+
ÄÌÑ
|
74
|
+
ÍÙ
|
75
|
+
ÔÅÂÑ
|
76
|
+
ÉÈ
|
77
|
+
ÞÅÍ
|
78
|
+
ÂÙÌÁ
|
79
|
+
ÓÁÍ
|
80
|
+
ÞÔÏÂ
|
81
|
+
ÂÅÚ
|
82
|
+
ÂÕÄÔÏ
|
83
|
+
ÞÅÌÏ×ÅË
|
84
|
+
ÞÅÇÏ
|
85
|
+
ÒÁÚ
|
86
|
+
ÔÏÖÅ
|
87
|
+
ÓÅÂÅ
|
88
|
+
ÐÏÄ
|
89
|
+
ÖÉÚÎØ
|
90
|
+
ÂÕÄÅÔ
|
91
|
+
Ö
|
92
|
+
ÔÏÇÄÁ
|
93
|
+
ËÔÏ
|
94
|
+
ÜÔÏÔ
|
95
|
+
ÇÏ×ÏÒÉÌ
|
96
|
+
ÔÏÇÏ
|
97
|
+
ÐÏÔÏÍÕ
|
98
|
+
ÜÔÏÇÏ
|
99
|
+
ËÁËÏÊ
|
100
|
+
ÓÏ×ÓÅÍ
|
101
|
+
ÎÉÍ
|
102
|
+
ÚÄÅÓØ
|
103
|
+
ÜÔÏÍ
|
104
|
+
ÏÄÉÎ
|
105
|
+
ÐÏÞÔÉ
|
106
|
+
ÍÏÊ
|
107
|
+
ÔÅÍ
|
108
|
+
ÞÔÏÂÙ
|
109
|
+
ÎÅÅ
|
110
|
+
ËÁÖÅÔÓÑ
|
111
|
+
ÓÅÊÞÁÓ
|
112
|
+
ÂÙÌÉ
|
113
|
+
ËÕÄÁ
|
114
|
+
ÚÁÞÅÍ
|
115
|
+
ÓËÁÚÁÔØ
|
116
|
+
×ÓÅÈ
|
117
|
+
ÎÉËÏÇÄÁ
|
118
|
+
ÓÅÇÏÄÎÑ
|
119
|
+
ÍÏÖÎÏ
|
120
|
+
ÐÒÉ
|
121
|
+
ÎÁËÏÎÅÃ
|
122
|
+
Ä×Á
|
123
|
+
ÏÂ
|
124
|
+
ÄÒÕÇÏÊ
|
125
|
+
ÈÏÔØ
|
126
|
+
ÐÏÓÌÅ
|
127
|
+
ÎÁÄ
|
128
|
+
ÂÏÌØÛÅ
|
129
|
+
ÔÏÔ
|
130
|
+
ÞÅÒÅÚ
|
131
|
+
ÜÔÉ
|
132
|
+
ÎÁÓ
|
133
|
+
ÐÒÏ
|
134
|
+
×ÓÅÇÏ
|
135
|
+
ÎÉÈ
|
136
|
+
ËÁËÁÑ
|
137
|
+
ÍÎÏÇÏ
|
138
|
+
ÒÁÚ×Å
|
139
|
+
ÓËÁÚÁÌÁ
|
140
|
+
ÔÒÉ
|
141
|
+
ÜÔÕ
|
142
|
+
ÍÏÑ
|
143
|
+
×ÐÒÏÞÅÍ
|
144
|
+
ÈÏÒÏÛÏ
|
145
|
+
Ó×ÏÀ
|
146
|
+
ÜÔÏÊ
|
147
|
+
ÐÅÒÅÄ
|
148
|
+
ÉÎÏÇÄÁ
|
149
|
+
ÌÕÞÛÅ
|
150
|
+
ÞÕÔØ
|
151
|
+
ÔÏÍ
|
152
|
+
ÎÅÌØÚÑ
|
153
|
+
ÔÁËÏÊ
|
154
|
+
ÉÍ
|
155
|
+
ÂÏÌÅÅ
|
156
|
+
×ÓÅÇÄÁ
|
157
|
+
ËÏÎÅÞÎÏ
|
158
|
+
×ÓÀ
|
159
|
+
ÍÅÖÄÕ
|
@@ -0,0 +1,114 @@
|
|
1
|
+
och
|
2
|
+
det
|
3
|
+
att
|
4
|
+
i
|
5
|
+
en
|
6
|
+
jag
|
7
|
+
hon
|
8
|
+
som
|
9
|
+
han
|
10
|
+
på
|
11
|
+
den
|
12
|
+
med
|
13
|
+
var
|
14
|
+
sig
|
15
|
+
för
|
16
|
+
så
|
17
|
+
till
|
18
|
+
är
|
19
|
+
men
|
20
|
+
ett
|
21
|
+
om
|
22
|
+
hade
|
23
|
+
de
|
24
|
+
av
|
25
|
+
icke
|
26
|
+
mig
|
27
|
+
du
|
28
|
+
henne
|
29
|
+
då
|
30
|
+
sin
|
31
|
+
nu
|
32
|
+
har
|
33
|
+
inte
|
34
|
+
hans
|
35
|
+
honom
|
36
|
+
skulle
|
37
|
+
hennes
|
38
|
+
där
|
39
|
+
min
|
40
|
+
man
|
41
|
+
ej
|
42
|
+
vid
|
43
|
+
kunde
|
44
|
+
något
|
45
|
+
från
|
46
|
+
ut
|
47
|
+
när
|
48
|
+
efter
|
49
|
+
upp
|
50
|
+
vi
|
51
|
+
dem
|
52
|
+
vara
|
53
|
+
vad
|
54
|
+
över
|
55
|
+
än
|
56
|
+
dig
|
57
|
+
kan
|
58
|
+
sina
|
59
|
+
här
|
60
|
+
ha
|
61
|
+
mot
|
62
|
+
alla
|
63
|
+
under
|
64
|
+
någon
|
65
|
+
eller
|
66
|
+
allt
|
67
|
+
mycket
|
68
|
+
sedan
|
69
|
+
ju
|
70
|
+
denna
|
71
|
+
själv
|
72
|
+
detta
|
73
|
+
åt
|
74
|
+
utan
|
75
|
+
varit
|
76
|
+
hur
|
77
|
+
ingen
|
78
|
+
mitt
|
79
|
+
ni
|
80
|
+
bli
|
81
|
+
blev
|
82
|
+
oss
|
83
|
+
din
|
84
|
+
dessa
|
85
|
+
några
|
86
|
+
deras
|
87
|
+
blir
|
88
|
+
mina
|
89
|
+
samma
|
90
|
+
vilken
|
91
|
+
er
|
92
|
+
sådan
|
93
|
+
vår
|
94
|
+
blivit
|
95
|
+
dess
|
96
|
+
inom
|
97
|
+
mellan
|
98
|
+
sådant
|
99
|
+
varför
|
100
|
+
varje
|
101
|
+
vilka
|
102
|
+
ditt
|
103
|
+
vem
|
104
|
+
vilket
|
105
|
+
sitta
|
106
|
+
sådana
|
107
|
+
vart
|
108
|
+
dina
|
109
|
+
vars
|
110
|
+
vårt
|
111
|
+
våra
|
112
|
+
ert
|
113
|
+
era
|
114
|
+
vilkas
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require_relative("../spec_helper.rb")

# Behaviour of the plain filter when it is built from a caller-supplied list.
describe Stopwords::Filter do
  context "when fed with a list of arbitrary words" do
    let(:stopwords) { ["a", "desde"] }
    let(:filter) { Stopwords::Filter.new(stopwords) }

    subject { filter }

    it "should remove the stopwords for the list of words to be filtered" do
      remaining = filter.filter("desde Santurce a Bilbao".split)
      remaining.should == ["Santurce", "Bilbao"]
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# encoding: utf-8
require_relative("../spec_helper.rb")

# Behaviour of the Snowball-backed filter when it is given a bundled locale.
describe Stopwords::Snowball::Filter do
  context "when pointed to a valid locale" do
    let(:filter) { Stopwords::Snowball::Filter.new("es") }

    # Full Spanish list, in the order the filter is expected to expose it.
    let(:spanish_stopwords) do
      %w[
        de la que el en y a los del se las por un para con no una su al lo
        como más pero sus le ya o este sí porque esta entre cuando muy sin
        sobre también me hasta hay donde quien desde todo nos durante todos
        uno les ni contra otros ese eso ante ellos e esto mí antes algunos
        qué unos yo otro otras otra él tanto esa estos mucho quienes nada
        muchos cual poco ella estar estas algunas algo nosotros mi mis tú te
        ti tu tus ellas nosotras vosotros vosotras os mío mía míos mías tuyo
        tuya tuyos tuyas suyo suya suyos suyas nuestro nuestra nuestros
        nuestras vuestro vuestra vuestros vuestras esos esas estoy estás está
        estamos estáis están esté estés estemos estéis estén estaré estarás
        estará estaremos estaréis estarán estaría estarías estaríamos
        estaríais estarían estaba estabas estábamos estabais estaban estuve
        estuviste estuvo estuvimos estuvisteis estuvieron estuviera
        estuvieras estuviéramos estuvierais estuvieran estuviese estuvieses
        estuviésemos estuvieseis estuviesen estando estado estada estados
        estadas estad he has ha hemos habéis han haya hayas hayamos hayáis
        hayan habré habrás habrá habremos habréis habrán habría habrías
        habríamos habríais habrían había habías habíamos habíais habían hube
        hubiste hubo hubimos hubisteis hubieron hubiera hubieras hubiéramos
        hubierais hubieran hubiese hubieses hubiésemos hubieseis hubiesen
        habiendo habido habida habidos habidas soy eres es somos sois son sea
        seas seamos seáis sean seré serás será seremos seréis serán sería
        serías seríamos seríais serían era eras éramos erais eran fui fuiste
        fue fuimos fuisteis fueron fuera fueras fuéramos fuerais fueran fuese
        fueses fuésemos fueseis fuesen siendo sido tengo tienes tiene tenemos
        tenéis tienen tenga tengas tengamos tengáis tengan tendré tendrás
        tendrá tendremos tendréis tendrán tendría tendrías tendríamos
        tendríais tendrían tenía tenías teníamos teníais tenían tuve tuviste
        tuvo tuvimos tuvisteis tuvieron tuviera tuvieras tuviéramos tuvierais
        tuvieran tuviese tuvieses tuviésemos tuvieseis tuviesen teniendo
        tenido tenida tenidos tenidas tened
      ]
    end

    subject { filter }

    its(:stopwords) { should == spanish_stopwords }

    it "should remove the stopwords for the list of words to be filtered" do
      remaining = filter.filter("desde Santurce a Bilbao".split)
      remaining.should == ["Santurce", "Bilbao"]
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require_relative '../lib/stopwords'
|
metadata
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: stopwords-filter
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.1.0
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- David J. Brenes
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2012-08-12 00:00:00 Z
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: Small library that allows you to create a simple stopwords filter or use some based on Snowball stopwords lists
|
17
|
+
email: davidjbrenes@gmail.com
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files:
|
23
|
+
- LICENSE.txt
|
24
|
+
- README.md
|
25
|
+
files:
|
26
|
+
- Gemfile
|
27
|
+
- Gemfile.lock
|
28
|
+
- README.md
|
29
|
+
- VERSION
|
30
|
+
- lib/stopwords.rb
|
31
|
+
- lib/stopwords/filter.rb
|
32
|
+
- lib/stopwords/snowball.rb
|
33
|
+
- lib/stopwords/snowball/filter.rb
|
34
|
+
- lib/stopwords/snowball/locales/da.csv
|
35
|
+
- lib/stopwords/snowball/locales/de.csv
|
36
|
+
- lib/stopwords/snowball/locales/en.csv
|
37
|
+
- lib/stopwords/snowball/locales/es.csv
|
38
|
+
- lib/stopwords/snowball/locales/fn.csv
|
39
|
+
- lib/stopwords/snowball/locales/fr.csv
|
40
|
+
- lib/stopwords/snowball/locales/hu.csv
|
41
|
+
- lib/stopwords/snowball/locales/it.csv
|
42
|
+
- lib/stopwords/snowball/locales/nl.csv
|
43
|
+
- lib/stopwords/snowball/locales/pt.csv
|
44
|
+
- lib/stopwords/snowball/locales/ru.csv
|
45
|
+
- lib/stopwords/snowball/locales/sv.csv
|
46
|
+
- spec/lib/filter_spec.rb
|
47
|
+
- spec/lib/snowball_filter_spec.rb
|
48
|
+
- spec/spec_helper.rb
|
49
|
+
- LICENSE.txt
|
50
|
+
homepage: http://github.com/brenes/stopwords-filter
|
51
|
+
licenses:
|
52
|
+
- MIT
|
53
|
+
post_install_message:
|
54
|
+
rdoc_options: []
|
55
|
+
|
56
|
+
require_paths:
|
57
|
+
- lib
|
58
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
59
|
+
none: false
|
60
|
+
requirements:
|
61
|
+
- - ">="
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: "0"
|
64
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: "0"
|
70
|
+
requirements: []
|
71
|
+
|
72
|
+
rubyforge_project:
|
73
|
+
rubygems_version: 1.8.24
|
74
|
+
signing_key:
|
75
|
+
specification_version: 3
|
76
|
+
summary: Snowball based filters for stopwords
|
77
|
+
test_files: []
|
78
|
+
|