stopwords-filter 0.2.1 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG +12 -0
- data/Gemfile +1 -1
- data/Gemfile.lock +21 -15
- data/README.md +4 -1
- data/VERSION +1 -1
- data/lib/stopwords/filter.rb +14 -10
- data/lib/stopwords/snowball/filter.rb +15 -18
- data/lib/stopwords/snowball/locales/el.csv +1 -0
- data/lib/stopwords/snowball/locales/pl.csv +1 -0
- data/lib/stopwords/snowball/locales/ro.csv +1 -0
- data/lib/stopwords/snowball/locales/ru.csv +1 -159
- data/lib/stopwords/snowball/locales/sv.csv +1 -114
- data/lib/stopwords/snowball/wordsieve.rb +17 -13
- data/spec/lib/filter_spec.rb +2 -2
- data/spec/lib/snowball_filter_spec.rb +24 -3
- metadata +27 -32
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ab6ffb4c393600755245462bd24815191b9a589a
|
4
|
+
data.tar.gz: 0a23aeeb9caaf8c65341a7bb7f2e8760fe8df9a9
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: f6d2c6f8524c2fceb16fa069fa95458e6346ebc0a0cd8c10746d018a4f0339b05bb91511e0edc6ca87b5f55e84d689f78fcdba174ef1c7e2d60926d2e29704ae
|
7
|
+
data.tar.gz: 3b51a3de282c95b86d1d0720d885d375cd21976af16ae987ffab49e0e0b9d380597f0170a5d76949f4529d96db63e6ba6d85a24f2278bbfb9b7b8d8aae83faf1
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,15 @@
|
|
1
|
+
0.5.0:
|
2
|
+
* Added Greek stopwords based on Lucene [@vrypan] [#13]
|
3
|
+
* Fixed CSV format for sv and ru locales [@woto] [#14]
|
4
|
+
* Fixed romanian and polish CSV not being included in the gemspec.
|
5
|
+
0.4.1:
|
6
|
+
* Dummy version to fix metadata in gemspec
|
7
|
+
0.4.0:
|
8
|
+
* Added Polish stopwords based on https://pl.wikipedia.org/wiki/Wikipedia:Stopwords [@grzegorzblaszczyk] [#9]
|
9
|
+
* Added info when no locale is found on snowball filter [@nerde] [#10]
|
10
|
+
0.3.0:
|
11
|
+
* Added custom words list to snowball filter [@sbeckeriv]
|
12
|
+
* Fixed problem about initialization: https://github.com/brenes/stopwords-filter/issues/3 [@zackxu1]
|
1
13
|
0.2.0:
|
2
14
|
* Added stopword? method [@s2gatev]
|
3
15
|
* Added Sieve class [@s2gatev]
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,29 +1,35 @@
|
|
1
1
|
GEM
|
2
2
|
remote: https://rubygems.org/
|
3
3
|
specs:
|
4
|
-
diff-lcs (1.
|
5
|
-
git (1.
|
4
|
+
diff-lcs (1.3)
|
5
|
+
git (1.3.0)
|
6
6
|
jeweler (1.8.4)
|
7
7
|
bundler (~> 1.0)
|
8
8
|
git (>= 1.2.5)
|
9
9
|
rake
|
10
10
|
rdoc
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
rspec-
|
17
|
-
|
18
|
-
rspec-
|
19
|
-
rspec-
|
20
|
-
|
21
|
-
|
22
|
-
rspec-mocks (
|
11
|
+
rake (12.0.0)
|
12
|
+
rdoc (5.0.0)
|
13
|
+
rspec (3.5.0)
|
14
|
+
rspec-core (~> 3.5.0)
|
15
|
+
rspec-expectations (~> 3.5.0)
|
16
|
+
rspec-mocks (~> 3.5.0)
|
17
|
+
rspec-core (3.5.4)
|
18
|
+
rspec-support (~> 3.5.0)
|
19
|
+
rspec-expectations (3.5.0)
|
20
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
21
|
+
rspec-support (~> 3.5.0)
|
22
|
+
rspec-mocks (3.5.0)
|
23
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
24
|
+
rspec-support (~> 3.5.0)
|
25
|
+
rspec-support (3.5.0)
|
23
26
|
|
24
27
|
PLATFORMS
|
25
28
|
ruby
|
26
29
|
|
27
30
|
DEPENDENCIES
|
28
31
|
jeweler (= 1.8.4)
|
29
|
-
rspec
|
32
|
+
rspec
|
33
|
+
|
34
|
+
BUNDLED WITH
|
35
|
+
1.13.6
|
data/README.md
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
Stopwords Filter
|
2
2
|
================
|
3
3
|
|
4
|
+
[![Build Status](https://travis-ci.org/brenes/stopwords-filter.svg?branch=master)](https://travis-ci.org/brenes/stopwords-filter)
|
5
|
+
|
4
6
|
This project is a very simple and naive implementation of a stopwords filter that remove a list of banned words (stopwords) from a sentence.
|
5
7
|
|
6
8
|
Quick guide
|
@@ -17,7 +19,8 @@ gem install stopwords-filter
|
|
17
19
|
or
|
18
20
|
|
19
21
|
```
|
20
|
-
|
22
|
+
# Don't forget the 'require:'
|
23
|
+
gem 'stopwords-filter', require: 'stopwords'
|
21
24
|
```
|
22
25
|
|
23
26
|
in your Gemfile.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.5.0
|
data/lib/stopwords/filter.rb
CHANGED
@@ -1,17 +1,21 @@
|
|
1
|
-
|
1
|
+
module Stopwords
|
2
2
|
|
3
|
-
|
3
|
+
class Filter
|
4
4
|
|
5
|
-
|
6
|
-
@stopwords = stopwords
|
7
|
-
end
|
5
|
+
attr_reader :stopwords
|
8
6
|
|
9
|
-
|
10
|
-
|
11
|
-
|
7
|
+
def initialize stopwords
|
8
|
+
@stopwords = stopwords
|
9
|
+
end
|
10
|
+
|
11
|
+
def filter words
|
12
|
+
words - @stopwords
|
13
|
+
end
|
14
|
+
|
15
|
+
def stopword? word
|
16
|
+
stopwords.include? word
|
17
|
+
end
|
12
18
|
|
13
|
-
def stopword? word
|
14
|
-
stopwords.include? word
|
15
19
|
end
|
16
20
|
|
17
21
|
end
|
@@ -1,19 +1,16 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
1
|
+
module Stopwords
|
2
|
+
module Snowball
|
3
|
+
class Filter < Stopwords::Filter
|
4
|
+
attr_reader :locale
|
5
|
+
attr_reader :locale_filename
|
6
|
+
|
7
|
+
def initialize locale, custom_list = []
|
8
|
+
@locale = locale
|
9
|
+
@locale_filename = "#{File.dirname(__FILE__)}/locales/#{locale}.csv"
|
10
|
+
|
11
|
+
raise ArgumentError.new("Unknown locale: #{locale.inspect}") unless File.exists?(@locale_filename)
|
12
|
+
super File.read(@locale_filename).split(",") + custom_list
|
13
|
+
end
|
14
|
+
end
|
16
15
|
end
|
17
|
-
|
18
|
-
|
19
|
-
end
|
16
|
+
end
|
@@ -0,0 +1 @@
|
|
1
|
+
ο,η,το,οι,τα,του,τησ,των,τον,την,και,κι,κ,ειμαι,εισαι,ειναι,ειμαστε,ειστε,στο,στον,στη,στην,μα,αλλα,απο,για,προσ,με,σε,ωσ,παρα,αντι,κατα,μετα,θα,να,δε,δεν,μη,μην,επι,ενω,εαν,αν,τοτε,που,πωσ,ποιοσ,ποια,ποιο,ποιοι,ποιεσ,ποιων,ποιουσ,αυτοσ,αυτη,αυτο,αυτοι,αυτων,αυτουσ,αυτεσ,αυτα,εκεινοσ,εκεινη,εκεινο,εκεινοι,εκεινεσ,εκεινα,εκεινων,εκεινουσ,οπωσ,ομωσ,ισωσ,οσο,οτι
|
@@ -0,0 +1 @@
|
|
1
|
+
a,aby,ach,acz,aczkolwiek,aj,albo,ale,ależ,ani,aż,bardziej,bardzo,bo,bowiem,by,byli,bynajmniej,być,był,była,było,były,będzie,będą,cali,cała,cały,ci,cię,ciebie,co,cokolwiek,coś,czasami,czasem,czemu,czy,czyli,daleko,dla,dlaczego,dlatego,do,dobrze,dokąd,dość,dużo,dwa,dwaj,dwie,dwoje,dziś,dzisiaj,gdy,gdyby,gdyż,gdzie,gdziekolwiek,gdzieś,i,ich,ile,im,inna,inne,inny,innych,iż,ja,ją,jak,jakaś,jakby,jaki,jakichś,jakie,jakiś,jakiż,jakkolwiek,jako,jakoś,je,jeden,jedna,jedno,jednak,jednakże,jego,jej,jemu,jest,jestem,jeszcze,jeśli,jeżeli,już,ją,każdy,kiedy,kilka,kimś,kto,ktokolwiek,ktoś,która,które,którego,której,który,których,którym,którzy,ku,lat,lecz,lub,ma,mają,mało,mam,mi,mimo,między,mną,mnie,mogą,moi,moim,moja,moje,może,możliwe,można,mój,mu,musi,my,na,nad,nam,nami,nas,nasi,nasz,nasza,nasze,naszego,naszych,natomiast,natychmiast,nawet,nią,nic,nich,nie,niech,niego,niej,niemu,nigdy,nim,nimi,niż,no,o,obok,od,około,on,ona,one,oni,ono,oraz,oto,owszem,pan,pana,pani,po,pod,podczas,pomimo,ponad,ponieważ,powinien,powinna,powinni,powinno,poza,prawie,przecież,przed,przede,przedtem,przez,przy,roku,również,sama,są,się,skąd,sobie,sobą,sposób,swoje,ta,tak,taka,taki,takie,także,tam,te,tego,tej,temu,ten,teraz,też,to,tobą,tobie,toteż,trzeba,tu,tutaj,twoi,twoim,twoja,twoje,twym,twój,ty,tych,tylko,tym,u,w,wam,wami,was,wasz,wasza,wasze,we,według,wiele,wielu,więc,więcej,wszyscy,wszystkich,wszystkie,wszystkim,wszystko,wtedy,wy,właśnie,z,za,zapewne,zawsze,ze,zł,znowu,znów,został,żaden,żadna,żadne,żadnych,że,żeby
|
@@ -0,0 +1 @@
|
|
1
|
+
acea,aceasta,această,aceea,acei,aceia,acel,acela,acele,acelea,acest,acesta,aceste,acestea,aceşti,aceştia,acolo,acord,acum,ai,aia,aibă,aici,al,ăla,ale,alea,ălea,altceva,altcineva,am,ar,are,aş,aşadar,asemenea,asta,ăsta,astăzi,astea,ăstea,ăştia,asupra,aţi,au,avea,avem,aveţi,azi,bine,bucur,bună,ca,că,căci,când,care,cărei,căror,cărui,cât,câte,câţi,către,câtva,caut,ce,cel,ceva,chiar,cinci,cînd,cine,cineva,cît,cîte,cîţi,cîtva,contra,cu,cum,cumva,curând,curînd,da,dă,dacă,dar,dată,datorită,dau,de,deci,deja,deoarece,departe,deşi,din,dinaintea,dintr-,dintre,doi,doilea,două,drept,după,ea,ei,el,ele,eram,este,eşti,eu,face,fără,fata,fi,fie,fiecare,fii,fim,fiţi,fiu,frumos,graţie,halbă,iar,ieri,îi,îl,îmi,împotriva,în ,înainte,înaintea,încât,încît,încotro,între,întrucât,întrucît,îţi,la,lângă,le,li,lîngă,lor,lui,mă,mai,mâine,mea,mei,mele,mereu,meu,mi,mie,mîine,mine,mult,multă,mulţi,mulţumesc,ne,nevoie,nicăieri,nici,nimeni,nimeri,nimic,nişte,noastră,noastre,noi,noroc,noştri,nostru,nouă,nu,opt,ori,oricând,oricare,oricât,orice,oricînd,oricine,oricît,oricum,oriunde,până,patra,patru,patrulea,pe,pentru,peste,pic,pînă,poate,pot,prea,prima,primul,prin,puţin,puţina,puţină,rog,sa,să,săi,sale,şapte,şase,sau,său,se,şi,sînt,sîntem,sînteţi,spate,spre,ştiu,sub,sunt,suntem,sunteţi,sută,ta,tăi,tale,tău,te,ţi,ţie,timp,tine,toată,toate,tot,toţi,totuşi,trei,treia,treilea,tu,un,una,unde,undeva,unei,uneia,unele,uneori,unii,unor,unora,unu,unui,unuia,unul,vă,vi,voastră,voastre,voi,voştri,vostru,vouă,vreme,vreo,vreun,zece,zero,zi,zice
|
@@ -1,159 +1 @@
|
|
1
|
-
|
2
|
-
×
|
3
|
-
×Ï
|
4
|
-
ÎÅ
|
5
|
-
ÞÔÏ
|
6
|
-
ÏÎ
|
7
|
-
ÎÁ
|
8
|
-
Ñ
|
9
|
-
Ó
|
10
|
-
ÓÏ
|
11
|
-
ËÁË
|
12
|
-
Á
|
13
|
-
ÔÏ
|
14
|
-
×ÓÅ
|
15
|
-
ÏÎÁ
|
16
|
-
ÔÁË
|
17
|
-
ÅÇÏ
|
18
|
-
ÎÏ
|
19
|
-
ÄÁ
|
20
|
-
ÔÙ
|
21
|
-
Ë
|
22
|
-
Õ
|
23
|
-
ÖÅ
|
24
|
-
×Ù
|
25
|
-
ÚÁ
|
26
|
-
ÂÙ
|
27
|
-
ÐÏ
|
28
|
-
ÔÏÌØËÏ
|
29
|
-
ÅÅ
|
30
|
-
ÍÎÅ
|
31
|
-
ÂÙÌÏ
|
32
|
-
×ÏÔ
|
33
|
-
ÏÔ
|
34
|
-
ÍÅÎÑ
|
35
|
-
ÅÝÅ
|
36
|
-
ÎÅÔ
|
37
|
-
Ï
|
38
|
-
ÉÚ
|
39
|
-
ÅÍÕ
|
40
|
-
ÔÅÐÅÒØ
|
41
|
-
ËÏÇÄÁ
|
42
|
-
ÄÁÖÅ
|
43
|
-
ÎÕ
|
44
|
-
×ÄÒÕÇ
|
45
|
-
ÌÉ
|
46
|
-
ÅÓÌÉ
|
47
|
-
ÕÖÅ
|
48
|
-
ÉÌÉ
|
49
|
-
ÎÉ
|
50
|
-
ÂÙÔØ
|
51
|
-
ÂÙÌ
|
52
|
-
ÎÅÇÏ
|
53
|
-
ÄÏ
|
54
|
-
×ÁÓ
|
55
|
-
ÎÉÂÕÄØ
|
56
|
-
ÏÐÑÔØ
|
57
|
-
ÕÖ
|
58
|
-
×ÁÍ
|
59
|
-
ÓËÁÚÁÌ
|
60
|
-
×ÅÄØ
|
61
|
-
ÔÁÍ
|
62
|
-
ÐÏÔÏÍ
|
63
|
-
ÓÅÂÑ
|
64
|
-
ÎÉÞÅÇÏ
|
65
|
-
ÅÊ
|
66
|
-
ÍÏÖÅÔ
|
67
|
-
ÏÎÉ
|
68
|
-
ÔÕÔ
|
69
|
-
ÇÄÅ
|
70
|
-
ÅÓÔØ
|
71
|
-
ÎÁÄÏ
|
72
|
-
ÎÅÊ
|
73
|
-
ÄÌÑ
|
74
|
-
ÍÙ
|
75
|
-
ÔÅÂÑ
|
76
|
-
ÉÈ
|
77
|
-
ÞÅÍ
|
78
|
-
ÂÙÌÁ
|
79
|
-
ÓÁÍ
|
80
|
-
ÞÔÏÂ
|
81
|
-
ÂÅÚ
|
82
|
-
ÂÕÄÔÏ
|
83
|
-
ÞÅÌÏ×ÅË
|
84
|
-
ÞÅÇÏ
|
85
|
-
ÒÁÚ
|
86
|
-
ÔÏÖÅ
|
87
|
-
ÓÅÂÅ
|
88
|
-
ÐÏÄ
|
89
|
-
ÖÉÚÎØ
|
90
|
-
ÂÕÄÅÔ
|
91
|
-
Ö
|
92
|
-
ÔÏÇÄÁ
|
93
|
-
ËÔÏ
|
94
|
-
ÜÔÏÔ
|
95
|
-
ÇÏ×ÏÒÉÌ
|
96
|
-
ÔÏÇÏ
|
97
|
-
ÐÏÔÏÍÕ
|
98
|
-
ÜÔÏÇÏ
|
99
|
-
ËÁËÏÊ
|
100
|
-
ÓÏ×ÓÅÍ
|
101
|
-
ÎÉÍ
|
102
|
-
ÚÄÅÓØ
|
103
|
-
ÜÔÏÍ
|
104
|
-
ÏÄÉÎ
|
105
|
-
ÐÏÞÔÉ
|
106
|
-
ÍÏÊ
|
107
|
-
ÔÅÍ
|
108
|
-
ÞÔÏÂÙ
|
109
|
-
ÎÅÅ
|
110
|
-
ËÁÖÅÔÓÑ
|
111
|
-
ÓÅÊÞÁÓ
|
112
|
-
ÂÙÌÉ
|
113
|
-
ËÕÄÁ
|
114
|
-
ÚÁÞÅÍ
|
115
|
-
ÓËÁÚÁÔØ
|
116
|
-
×ÓÅÈ
|
117
|
-
ÎÉËÏÇÄÁ
|
118
|
-
ÓÅÇÏÄÎÑ
|
119
|
-
ÍÏÖÎÏ
|
120
|
-
ÐÒÉ
|
121
|
-
ÎÁËÏÎÅÃ
|
122
|
-
Ä×Á
|
123
|
-
ÏÂ
|
124
|
-
ÄÒÕÇÏÊ
|
125
|
-
ÈÏÔØ
|
126
|
-
ÐÏÓÌÅ
|
127
|
-
ÎÁÄ
|
128
|
-
ÂÏÌØÛÅ
|
129
|
-
ÔÏÔ
|
130
|
-
ÞÅÒÅÚ
|
131
|
-
ÜÔÉ
|
132
|
-
ÎÁÓ
|
133
|
-
ÐÒÏ
|
134
|
-
×ÓÅÇÏ
|
135
|
-
ÎÉÈ
|
136
|
-
ËÁËÁÑ
|
137
|
-
ÍÎÏÇÏ
|
138
|
-
ÒÁÚ×Å
|
139
|
-
ÓËÁÚÁÌÁ
|
140
|
-
ÔÒÉ
|
141
|
-
ÜÔÕ
|
142
|
-
ÍÏÑ
|
143
|
-
×ÐÒÏÞÅÍ
|
144
|
-
ÈÏÒÏÛÏ
|
145
|
-
Ó×ÏÀ
|
146
|
-
ÜÔÏÊ
|
147
|
-
ÐÅÒÅÄ
|
148
|
-
ÉÎÏÇÄÁ
|
149
|
-
ÌÕÞÛÅ
|
150
|
-
ÞÕÔØ
|
151
|
-
ÔÏÍ
|
152
|
-
ÎÅÌØÚÑ
|
153
|
-
ÔÁËÏÊ
|
154
|
-
ÉÍ
|
155
|
-
ÂÏÌÅÅ
|
156
|
-
×ÓÅÇÄÁ
|
157
|
-
ËÏÎÅÞÎÏ
|
158
|
-
×ÓÀ
|
159
|
-
ÍÅÖÄÕ
|
1
|
+
и,в,во,не,что,он,на,я,с,со,как,а,то,все,она,так,его,но,да,ты,к,у,же,вы,за,бы,по,только,ее,мне,было,вот,от,меня,еще,нет,о,из,ему,теперь,когда,даже,ну,вдруг,ли,если,уже,или,ни,быть,был,него,до,вас,нибудь,опять,уж,вам,сказал,ведь,там,потом,себя,ничего,ей,может,они,тут,где,есть,надо,ней,для,мы,тебя,их,чем,была,сам,чтоб,без,будто,человек,чего,раз,тоже,себе,под,жизнь,будет,ж,тогда,кто,этот,говорил,того,потому,этого,какой,совсем,ним,здесь,этом,один,почти,мой,тем,чтобы,нее,кажется,сейчас,были,куда,зачем,сказать,всех,никогда,сегодня,можно,при,наконец,два,об,другой,хоть,после,над,больше,тот,через,эти,нас,про,всего,них,какая,много,разве,сказала,три,эту,моя,впрочем,хорошо,свою,этой,перед,иногда,лучше,чуть,том,нельзя,такой,им,более,всегда,конечно,всю,между
|
@@ -1,114 +1 @@
|
|
1
|
-
och
|
2
|
-
det
|
3
|
-
att
|
4
|
-
i
|
5
|
-
en
|
6
|
-
jag
|
7
|
-
hon
|
8
|
-
som
|
9
|
-
han
|
10
|
-
på
|
11
|
-
den
|
12
|
-
med
|
13
|
-
var
|
14
|
-
sig
|
15
|
-
för
|
16
|
-
så
|
17
|
-
till
|
18
|
-
är
|
19
|
-
men
|
20
|
-
ett
|
21
|
-
om
|
22
|
-
hade
|
23
|
-
de
|
24
|
-
av
|
25
|
-
icke
|
26
|
-
mig
|
27
|
-
du
|
28
|
-
henne
|
29
|
-
då
|
30
|
-
sin
|
31
|
-
nu
|
32
|
-
har
|
33
|
-
inte
|
34
|
-
hans
|
35
|
-
honom
|
36
|
-
skulle
|
37
|
-
hennes
|
38
|
-
där
|
39
|
-
min
|
40
|
-
man
|
41
|
-
ej
|
42
|
-
vid
|
43
|
-
kunde
|
44
|
-
något
|
45
|
-
från
|
46
|
-
ut
|
47
|
-
när
|
48
|
-
efter
|
49
|
-
upp
|
50
|
-
vi
|
51
|
-
dem
|
52
|
-
vara
|
53
|
-
vad
|
54
|
-
över
|
55
|
-
än
|
56
|
-
dig
|
57
|
-
kan
|
58
|
-
sina
|
59
|
-
här
|
60
|
-
ha
|
61
|
-
mot
|
62
|
-
alla
|
63
|
-
under
|
64
|
-
någon
|
65
|
-
eller
|
66
|
-
allt
|
67
|
-
mycket
|
68
|
-
sedan
|
69
|
-
ju
|
70
|
-
denna
|
71
|
-
själv
|
72
|
-
detta
|
73
|
-
åt
|
74
|
-
utan
|
75
|
-
varit
|
76
|
-
hur
|
77
|
-
ingen
|
78
|
-
mitt
|
79
|
-
ni
|
80
|
-
bli
|
81
|
-
blev
|
82
|
-
oss
|
83
|
-
din
|
84
|
-
dessa
|
85
|
-
några
|
86
|
-
deras
|
87
|
-
blir
|
88
|
-
mina
|
89
|
-
samma
|
90
|
-
vilken
|
91
|
-
er
|
92
|
-
sådan
|
93
|
-
vår
|
94
|
-
blivit
|
95
|
-
dess
|
96
|
-
inom
|
97
|
-
mellan
|
98
|
-
sådant
|
99
|
-
varför
|
100
|
-
varje
|
101
|
-
vilka
|
102
|
-
ditt
|
103
|
-
vem
|
104
|
-
vilket
|
105
|
-
sitta
|
106
|
-
sådana
|
107
|
-
vart
|
108
|
-
dina
|
109
|
-
vars
|
110
|
-
vårt
|
111
|
-
våra
|
112
|
-
ert
|
113
|
-
era
|
114
|
-
vilkas
|
1
|
+
och,det,att,i,en,jag,hon,som,han,på,den,med,var,sig,för,så,till,är,men,ett,om,hade,de,av,icke,mig,du,henne,då,sin,nu,har,inte,hans,honom,skulle,hennes,där,min,man,ej,vid,kunde,något,från,ut,när,efter,upp,vi,dem,vara,vad,över,än,dig,kan,sina,här,ha,mot,alla,under,någon,eller,allt,mycket,sedan,ju,denna,själv,detta,åt,utan,varit,hur,ingen,mitt,ni,bli,blev,oss,din,dessa,några,deras,blir,mina,samma,vilken,er,sådan,vår,blivit,dess,inom,mellan,sådant,varför,varje,vilka,ditt,vem,vilket,sitta,sådana,vart,dina,vars,vårt,våra,ert,era,vilkas,
|
@@ -1,16 +1,20 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
1
|
+
module Stopwords
|
2
|
+
module Snowball
|
3
|
+
class Stopwords::Snowball::WordSieve
|
4
|
+
def initialize custom_list = []
|
5
|
+
@filters = Dir[File.dirname(__FILE__) + '/locales/*.csv'].each_with_object({}) do |file, filters|
|
6
|
+
lang = File.basename(file, '.csv').to_sym
|
7
|
+
filters[lang] = Stopwords::Snowball::Filter.new lang, custom_list
|
8
|
+
end
|
9
|
+
end
|
8
10
|
|
9
|
-
|
10
|
-
|
11
|
-
|
11
|
+
def stopword? args={}
|
12
|
+
args[:lang] ? @filters[args[:lang]].stopword?(args[:word] ) : false
|
13
|
+
end
|
12
14
|
|
13
|
-
|
14
|
-
|
15
|
+
def filter args={}
|
16
|
+
args[:lang] ? @filters[args[:lang]].filter(args[:words] ) : args[:words]
|
17
|
+
end
|
18
|
+
end
|
15
19
|
end
|
16
|
-
end
|
20
|
+
end
|
data/spec/lib/filter_spec.rb
CHANGED
@@ -9,9 +9,9 @@ describe Stopwords::Filter do
|
|
9
9
|
|
10
10
|
subject { filter }
|
11
11
|
|
12
|
-
it("should remove the stopwords for the list of words to be filtered") { filter.filter("desde Santurce a Bilbao".split).
|
12
|
+
it("should remove the stopwords for the list of words to be filtered") { expect(filter.filter("desde Santurce a Bilbao".split)).to eq ["Santurce", "Bilbao"]}
|
13
13
|
|
14
14
|
end
|
15
15
|
|
16
16
|
|
17
|
-
end
|
17
|
+
end
|
@@ -9,11 +9,32 @@ describe Stopwords::Snowball::Filter do
|
|
9
9
|
|
10
10
|
subject { filter }
|
11
11
|
|
12
|
-
|
12
|
+
it("should have the appropiate stopwords") { expect(subject.stopwords).to eq ["de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", "por", "un", "para", "con", "no", "una", "su", "al", "lo", "como", "más", "pero", "sus", "le", "ya", "o", "este", "sí", "porque", "esta", "entre", "cuando", "muy", "sin", "sobre", "también", "me", "hasta", "hay", "donde", "quien", "desde", "todo", "nos", "durante", "todos", "uno", "les", "ni", "contra", "otros", "ese", "eso", "ante", "ellos", "e", "esto", "mí", "antes", "algunos", "qué", "unos", "yo", "otro", "otras", "otra", "él", "tanto", "esa", "estos", "mucho", "quienes", "nada", "muchos", "cual", "poco", "ella", "estar", "estas", "algunas", "algo", "nosotros", "mi", "mis", "tú", "te", "ti", "tu", "tus", "ellas", "nosotras", "vosotros", "vosotras", "os", "mío", "mía", "míos", "mías", "tuyo", "tuya", "tuyos", "tuyas", "suyo", "suya", "suyos", "suyas", "nuestro", "nuestra", "nuestros", "nuestras", "vuestro", "vuestra", "vuestros", "vuestras", "esos", "esas", "estoy", "estás", "está", "estamos", "estáis", "están", "esté", "estés", "estemos", "estéis", "estén", "estaré", "estarás", "estará", "estaremos", "estaréis", "estarán", "estaría", "estarías", "estaríamos", "estaríais", "estarían", "estaba", "estabas", "estábamos", "estabais", "estaban", "estuve", "estuviste", "estuvo", "estuvimos", "estuvisteis", "estuvieron", "estuviera", "estuvieras", "estuviéramos", "estuvierais", "estuvieran", "estuviese", "estuvieses", "estuviésemos", "estuvieseis", "estuviesen", "estando", "estado", "estada", "estados", "estadas", "estad", "he", "has", "ha", "hemos", "habéis", "han", "haya", "hayas", "hayamos", "hayáis", "hayan", "habré", "habrás", "habrá", "habremos", "habréis", "habrán", "habría", "habrías", "habríamos", "habríais", "habrían", "había", "habías", "habíamos", "habíais", "habían", "hube", "hubiste", "hubo", "hubimos", "hubisteis", "hubieron", "hubiera", "hubieras", "hubiéramos", "hubierais", "hubieran", "hubiese", "hubieses", "hubiésemos", "hubieseis", "hubiesen", "habiendo", "habido", "habida", "habidos", "habidas", "soy", "eres", "es", "somos", "sois", "son", "sea", "seas", "seamos", "seáis", "sean", "seré", "serás", "será", "seremos", "seréis", "serán", "sería", "serías", "seríamos", "seríais", "serían", "era", "eras", "éramos", "erais", "eran", "fui", "fuiste", "fue", "fuimos", "fuisteis", "fueron", "fuera", "fueras", "fuéramos", "fuerais", "fueran", "fuese", "fueses", "fuésemos", "fueseis", "fuesen", "siendo", "sido", "tengo", "tienes", "tiene", "tenemos", "tenéis", "tienen", "tenga", "tengas", "tengamos", "tengáis", "tengan", "tendré", "tendrás", "tendrá", "tendremos", "tendréis", "tendrán", "tendría", "tendrías", "tendríamos", "tendríais", "tendrían", "tenía", "tenías", "teníamos", "teníais", "tenían", "tuve", "tuviste", "tuvo", "tuvimos", "tuvisteis", "tuvieron", "tuviera", "tuvieras", "tuviéramos", "tuvierais", "tuvieran", "tuviese", "tuvieses", "tuviésemos", "tuvieseis", "tuviesen", "teniendo", "tenido", "tenida", "tenidos", "tenidas", "tened"]}
|
13
13
|
|
14
|
-
it("should remove the stopwords for the list of words to be filtered") { filter.filter("desde Santurce a Bilbao".split).
|
14
|
+
it("should remove the stopwords for the list of words to be filtered") { expect(filter.filter("desde Santurce a Bilbao".split)).to eq ["Santurce", "Bilbao"]}
|
15
15
|
|
16
16
|
end
|
17
17
|
|
18
|
+
context "when custom list" do
|
18
19
|
|
19
|
-
|
20
|
+
let (:filter) { Stopwords::Snowball::Filter.new "es", ["Santurce"] }
|
21
|
+
|
22
|
+
subject { filter }
|
23
|
+
|
24
|
+
it("should remove the stopwords for the list of words to be filtered") { expect(filter.filter("desde Santurce a Bilbao".split)).to eq ["Bilbao"]}
|
25
|
+
|
26
|
+
end
|
27
|
+
|
28
|
+
context "when locale is not found" do
|
29
|
+
|
30
|
+
let (:filter) { Stopwords::Snowball::Filter.new "no-real" }
|
31
|
+
|
32
|
+
subject { filter }
|
33
|
+
|
34
|
+
it("should throw an error explaining the locale not found") { expect{filter}.to raise_error(ArgumentError)}
|
35
|
+
|
36
|
+
end
|
37
|
+
|
38
|
+
|
39
|
+
|
40
|
+
end
|
metadata
CHANGED
@@ -1,42 +1,39 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: stopwords-filter
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
version: 0.2.1
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.5.0
|
6
5
|
platform: ruby
|
7
|
-
authors:
|
6
|
+
authors:
|
8
7
|
- David J. Brenes
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
|
13
|
-
date: 2013-02-03 00:00:00 Z
|
11
|
+
date: 2021-06-09 00:00:00.000000000 Z
|
14
12
|
dependencies: []
|
15
|
-
|
16
|
-
|
13
|
+
description: Small library that allows you to create a simple stopwords filter or
|
14
|
+
use some based on Snowball stopwords lists
|
17
15
|
email: davidjbrenes@gmail.com
|
18
16
|
executables: []
|
19
|
-
|
20
17
|
extensions: []
|
21
|
-
|
22
|
-
extra_rdoc_files:
|
18
|
+
extra_rdoc_files:
|
23
19
|
- CHANGELOG
|
24
20
|
- LICENSE.txt
|
25
21
|
- README.md
|
26
|
-
files:
|
22
|
+
files:
|
27
23
|
- CHANGELOG
|
28
24
|
- Gemfile
|
29
25
|
- Gemfile.lock
|
26
|
+
- LICENSE.txt
|
30
27
|
- README.md
|
31
28
|
- VERSION
|
32
29
|
- lib/stopwords.rb
|
33
30
|
- lib/stopwords/filter.rb
|
34
31
|
- lib/stopwords/snowball.rb
|
35
32
|
- lib/stopwords/snowball/filter.rb
|
36
|
-
- lib/stopwords/snowball/wordsieve.rb
|
37
33
|
- lib/stopwords/snowball/locales/bg.csv
|
38
34
|
- lib/stopwords/snowball/locales/da.csv
|
39
35
|
- lib/stopwords/snowball/locales/de.csv
|
36
|
+
- lib/stopwords/snowball/locales/el.csv
|
40
37
|
- lib/stopwords/snowball/locales/en.csv
|
41
38
|
- lib/stopwords/snowball/locales/es.csv
|
42
39
|
- lib/stopwords/snowball/locales/fn.csv
|
@@ -44,39 +41,37 @@ files:
|
|
44
41
|
- lib/stopwords/snowball/locales/hu.csv
|
45
42
|
- lib/stopwords/snowball/locales/it.csv
|
46
43
|
- lib/stopwords/snowball/locales/nl.csv
|
44
|
+
- lib/stopwords/snowball/locales/pl.csv
|
47
45
|
- lib/stopwords/snowball/locales/pt.csv
|
46
|
+
- lib/stopwords/snowball/locales/ro.csv
|
48
47
|
- lib/stopwords/snowball/locales/ru.csv
|
49
48
|
- lib/stopwords/snowball/locales/sv.csv
|
49
|
+
- lib/stopwords/snowball/wordsieve.rb
|
50
50
|
- spec/lib/filter_spec.rb
|
51
51
|
- spec/lib/snowball_filter_spec.rb
|
52
52
|
- spec/spec_helper.rb
|
53
|
-
- LICENSE.txt
|
54
53
|
homepage: http://github.com/brenes/stopwords-filter
|
55
|
-
licenses:
|
54
|
+
licenses:
|
56
55
|
- MIT
|
56
|
+
metadata: {}
|
57
57
|
post_install_message:
|
58
58
|
rdoc_options: []
|
59
|
-
|
60
|
-
require_paths:
|
59
|
+
require_paths:
|
61
60
|
- lib
|
62
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
63
|
-
|
64
|
-
requirements:
|
61
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
62
|
+
requirements:
|
65
63
|
- - ">="
|
66
|
-
- !ruby/object:Gem::Version
|
67
|
-
version:
|
68
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
69
|
-
|
70
|
-
requirements:
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
66
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
71
68
|
- - ">="
|
72
|
-
- !ruby/object:Gem::Version
|
73
|
-
version:
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: '0'
|
74
71
|
requirements: []
|
75
|
-
|
76
72
|
rubyforge_project:
|
77
|
-
rubygems_version:
|
73
|
+
rubygems_version: 2.5.1
|
78
74
|
signing_key:
|
79
|
-
specification_version:
|
75
|
+
specification_version: 4
|
80
76
|
summary: Snowball based filters for stopwords
|
81
77
|
test_files: []
|
82
|
-
|