stopwords-filter 0.2.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG +12 -0
- data/Gemfile +1 -1
- data/Gemfile.lock +21 -15
- data/README.md +4 -1
- data/VERSION +1 -1
- data/lib/stopwords/filter.rb +14 -10
- data/lib/stopwords/snowball/filter.rb +15 -18
- data/lib/stopwords/snowball/locales/el.csv +1 -0
- data/lib/stopwords/snowball/locales/pl.csv +1 -0
- data/lib/stopwords/snowball/locales/ro.csv +1 -0
- data/lib/stopwords/snowball/locales/ru.csv +1 -159
- data/lib/stopwords/snowball/locales/sv.csv +1 -114
- data/lib/stopwords/snowball/wordsieve.rb +17 -13
- data/spec/lib/filter_spec.rb +2 -2
- data/spec/lib/snowball_filter_spec.rb +24 -3
- metadata +27 -32
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ab6ffb4c393600755245462bd24815191b9a589a
|
4
|
+
data.tar.gz: 0a23aeeb9caaf8c65341a7bb7f2e8760fe8df9a9
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: f6d2c6f8524c2fceb16fa069fa95458e6346ebc0a0cd8c10746d018a4f0339b05bb91511e0edc6ca87b5f55e84d689f78fcdba174ef1c7e2d60926d2e29704ae
|
7
|
+
data.tar.gz: 3b51a3de282c95b86d1d0720d885d375cd21976af16ae987ffab49e0e0b9d380597f0170a5d76949f4529d96db63e6ba6d85a24f2278bbfb9b7b8d8aae83faf1
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,15 @@
|
|
1
|
+
0.5.0:
|
2
|
+
* Added Greek stopwords based on Lucene [@vrypan] [#13]
|
3
|
+
* Fixed CSV format for sv and ru locales [@woto] [#14]
|
4
|
+
* Fixed Romanian and Polish CSV files not being included in the gemspec.
|
5
|
+
0.4.1:
|
6
|
+
* Dummy version to fix metadata in gemspec
|
7
|
+
0.4.0:
|
8
|
+
* Added Polish stopwords based on https://pl.wikipedia.org/wiki/Wikipedia:Stopwords [@grzegorzblaszczyk] [#9]
|
9
|
+
* Added info when no locale is found on snowball filter [@nerde] [#10]
|
10
|
+
0.3.0:
|
11
|
+
* Added custom words list to snowball filter [@sbeckeriv]
|
12
|
+
* Fixed problem about initialization: https://github.com/brenes/stopwords-filter/issues/3 [@zackxu1]
|
1
13
|
0.2.0:
|
2
14
|
* Added stopword? method [@s2gatev]
|
3
15
|
* Added Sieve class [@s2gatev]
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,29 +1,35 @@
|
|
1
1
|
GEM
|
2
2
|
remote: https://rubygems.org/
|
3
3
|
specs:
|
4
|
-
diff-lcs (1.
|
5
|
-
git (1.
|
4
|
+
diff-lcs (1.3)
|
5
|
+
git (1.3.0)
|
6
6
|
jeweler (1.8.4)
|
7
7
|
bundler (~> 1.0)
|
8
8
|
git (>= 1.2.5)
|
9
9
|
rake
|
10
10
|
rdoc
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
rspec-
|
17
|
-
|
18
|
-
rspec-
|
19
|
-
rspec-
|
20
|
-
|
21
|
-
|
22
|
-
rspec-mocks (
|
11
|
+
rake (12.0.0)
|
12
|
+
rdoc (5.0.0)
|
13
|
+
rspec (3.5.0)
|
14
|
+
rspec-core (~> 3.5.0)
|
15
|
+
rspec-expectations (~> 3.5.0)
|
16
|
+
rspec-mocks (~> 3.5.0)
|
17
|
+
rspec-core (3.5.4)
|
18
|
+
rspec-support (~> 3.5.0)
|
19
|
+
rspec-expectations (3.5.0)
|
20
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
21
|
+
rspec-support (~> 3.5.0)
|
22
|
+
rspec-mocks (3.5.0)
|
23
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
24
|
+
rspec-support (~> 3.5.0)
|
25
|
+
rspec-support (3.5.0)
|
23
26
|
|
24
27
|
PLATFORMS
|
25
28
|
ruby
|
26
29
|
|
27
30
|
DEPENDENCIES
|
28
31
|
jeweler (= 1.8.4)
|
29
|
-
rspec
|
32
|
+
rspec
|
33
|
+
|
34
|
+
BUNDLED WITH
|
35
|
+
1.13.6
|
data/README.md
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
Stopwords Filter
|
2
2
|
================
|
3
3
|
|
4
|
+
[Build Status](https://travis-ci.org/brenes/stopwords-filter)
|
5
|
+
|
4
6
|
This project is a very simple and naive implementation of a stopwords filter that removes a list of banned words (stopwords) from a sentence.
|
5
7
|
|
6
8
|
Quick guide
|
@@ -17,7 +19,8 @@ gem install stopwords-filter
|
|
17
19
|
or
|
18
20
|
|
19
21
|
```
|
20
|
-
|
22
|
+
# Don't forget the 'require:'
|
23
|
+
gem 'stopwords-filter', require: 'stopwords'
|
21
24
|
```
|
22
25
|
|
23
26
|
in your Gemfile.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.5.0
|
data/lib/stopwords/filter.rb
CHANGED
@@ -1,17 +1,21 @@
|
|
1
|
-
|
1
|
+
module Stopwords
|
2
2
|
|
3
|
-
|
3
|
+
class Filter
|
4
4
|
|
5
|
-
|
6
|
-
@stopwords = stopwords
|
7
|
-
end
|
5
|
+
attr_reader :stopwords
|
8
6
|
|
9
|
-
|
10
|
-
|
11
|
-
|
7
|
+
def initialize stopwords
|
8
|
+
@stopwords = stopwords
|
9
|
+
end
|
10
|
+
|
11
|
+
def filter words
|
12
|
+
words - @stopwords
|
13
|
+
end
|
14
|
+
|
15
|
+
def stopword? word
|
16
|
+
stopwords.include? word
|
17
|
+
end
|
12
18
|
|
13
|
-
def stopword? word
|
14
|
-
stopwords.include? word
|
15
19
|
end
|
16
20
|
|
17
21
|
end
|
@@ -1,19 +1,16 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
1
|
+
module Stopwords
|
2
|
+
module Snowball
|
3
|
+
class Filter < Stopwords::Filter
|
4
|
+
attr_reader :locale
|
5
|
+
attr_reader :locale_filename
|
6
|
+
|
7
|
+
def initialize locale, custom_list = []
|
8
|
+
@locale = locale
|
9
|
+
@locale_filename = "#{File.dirname(__FILE__)}/locales/#{locale}.csv"
|
10
|
+
|
11
|
+
raise ArgumentError.new("Unknown locale: #{locale.inspect}") unless File.exists?(@locale_filename)
|
12
|
+
super File.read(@locale_filename).split(",") + custom_list
|
13
|
+
end
|
14
|
+
end
|
16
15
|
end
|
17
|
-
|
18
|
-
|
19
|
-
end
|
16
|
+
end
|
@@ -0,0 +1 @@
|
|
1
|
+
ο,η,το,οι,τα,του,τησ,των,τον,την,και,κι,κ,ειμαι,εισαι,ειναι,ειμαστε,ειστε,στο,στον,στη,στην,μα,αλλα,απο,για,προσ,με,σε,ωσ,παρα,αντι,κατα,μετα,θα,να,δε,δεν,μη,μην,επι,ενω,εαν,αν,τοτε,που,πωσ,ποιοσ,ποια,ποιο,ποιοι,ποιεσ,ποιων,ποιουσ,αυτοσ,αυτη,αυτο,αυτοι,αυτων,αυτουσ,αυτεσ,αυτα,εκεινοσ,εκεινη,εκεινο,εκεινοι,εκεινεσ,εκεινα,εκεινων,εκεινουσ,οπωσ,ομωσ,ισωσ,οσο,οτι
|
@@ -0,0 +1 @@
|
|
1
|
+
a,aby,ach,acz,aczkolwiek,aj,albo,ale,ależ,ani,aż,bardziej,bardzo,bo,bowiem,by,byli,bynajmniej,być,był,była,było,były,będzie,będą,cali,cała,cały,ci,cię,ciebie,co,cokolwiek,coś,czasami,czasem,czemu,czy,czyli,daleko,dla,dlaczego,dlatego,do,dobrze,dokąd,dość,dużo,dwa,dwaj,dwie,dwoje,dziś,dzisiaj,gdy,gdyby,gdyż,gdzie,gdziekolwiek,gdzieś,i,ich,ile,im,inna,inne,inny,innych,iż,ja,ją,jak,jakaś,jakby,jaki,jakichś,jakie,jakiś,jakiż,jakkolwiek,jako,jakoś,je,jeden,jedna,jedno,jednak,jednakże,jego,jej,jemu,jest,jestem,jeszcze,jeśli,jeżeli,już,ją,każdy,kiedy,kilka,kimś,kto,ktokolwiek,ktoś,która,które,którego,której,który,których,którym,którzy,ku,lat,lecz,lub,ma,mają,mało,mam,mi,mimo,między,mną,mnie,mogą,moi,moim,moja,moje,może,możliwe,można,mój,mu,musi,my,na,nad,nam,nami,nas,nasi,nasz,nasza,nasze,naszego,naszych,natomiast,natychmiast,nawet,nią,nic,nich,nie,niech,niego,niej,niemu,nigdy,nim,nimi,niż,no,o,obok,od,około,on,ona,one,oni,ono,oraz,oto,owszem,pan,pana,pani,po,pod,podczas,pomimo,ponad,ponieważ,powinien,powinna,powinni,powinno,poza,prawie,przecież,przed,przede,przedtem,przez,przy,roku,również,sama,są,się,skąd,sobie,sobą,sposób,swoje,ta,tak,taka,taki,takie,także,tam,te,tego,tej,temu,ten,teraz,też,to,tobą,tobie,toteż,trzeba,tu,tutaj,twoi,twoim,twoja,twoje,twym,twój,ty,tych,tylko,tym,u,w,wam,wami,was,wasz,wasza,wasze,we,według,wiele,wielu,więc,więcej,wszyscy,wszystkich,wszystkie,wszystkim,wszystko,wtedy,wy,właśnie,z,za,zapewne,zawsze,ze,zł,znowu,znów,został,żaden,żadna,żadne,żadnych,że,żeby
|
@@ -0,0 +1 @@
|
|
1
|
+
acea,aceasta,această,aceea,acei,aceia,acel,acela,acele,acelea,acest,acesta,aceste,acestea,aceşti,aceştia,acolo,acord,acum,ai,aia,aibă,aici,al,ăla,ale,alea,ălea,altceva,altcineva,am,ar,are,aş,aşadar,asemenea,asta,ăsta,astăzi,astea,ăstea,ăştia,asupra,aţi,au,avea,avem,aveţi,azi,bine,bucur,bună,ca,că,căci,când,care,cărei,căror,cărui,cât,câte,câţi,către,câtva,caut,ce,cel,ceva,chiar,cinci,cînd,cine,cineva,cît,cîte,cîţi,cîtva,contra,cu,cum,cumva,curând,curînd,da,dă,dacă,dar,dată,datorită,dau,de,deci,deja,deoarece,departe,deşi,din,dinaintea,dintr-,dintre,doi,doilea,două,drept,după,ea,ei,el,ele,eram,este,eşti,eu,face,fără,fata,fi,fie,fiecare,fii,fim,fiţi,fiu,frumos,graţie,halbă,iar,ieri,îi,îl,îmi,împotriva,în ,înainte,înaintea,încât,încît,încotro,între,întrucât,întrucît,îţi,la,lângă,le,li,lîngă,lor,lui,mă,mai,mâine,mea,mei,mele,mereu,meu,mi,mie,mîine,mine,mult,multă,mulţi,mulţumesc,ne,nevoie,nicăieri,nici,nimeni,nimeri,nimic,nişte,noastră,noastre,noi,noroc,noştri,nostru,nouă,nu,opt,ori,oricând,oricare,oricât,orice,oricînd,oricine,oricît,oricum,oriunde,până,patra,patru,patrulea,pe,pentru,peste,pic,pînă,poate,pot,prea,prima,primul,prin,puţin,puţina,puţină,rog,sa,să,săi,sale,şapte,şase,sau,său,se,şi,sînt,sîntem,sînteţi,spate,spre,ştiu,sub,sunt,suntem,sunteţi,sută,ta,tăi,tale,tău,te,ţi,ţie,timp,tine,toată,toate,tot,toţi,totuşi,trei,treia,treilea,tu,un,una,unde,undeva,unei,uneia,unele,uneori,unii,unor,unora,unu,unui,unuia,unul,vă,vi,voastră,voastre,voi,voştri,vostru,vouă,vreme,vreo,vreun,zece,zero,zi,zice
|
@@ -1,159 +1 @@
|
|
1
|
-
|
2
|
-
×
|
3
|
-
×Ï
|
4
|
-
ÎÅ
|
5
|
-
ÞÔÏ
|
6
|
-
ÏÎ
|
7
|
-
ÎÁ
|
8
|
-
Ñ
|
9
|
-
Ó
|
10
|
-
ÓÏ
|
11
|
-
ËÁË
|
12
|
-
Á
|
13
|
-
ÔÏ
|
14
|
-
×ÓÅ
|
15
|
-
ÏÎÁ
|
16
|
-
ÔÁË
|
17
|
-
ÅÇÏ
|
18
|
-
ÎÏ
|
19
|
-
ÄÁ
|
20
|
-
ÔÙ
|
21
|
-
Ë
|
22
|
-
Õ
|
23
|
-
ÖÅ
|
24
|
-
×Ù
|
25
|
-
ÚÁ
|
26
|
-
ÂÙ
|
27
|
-
ÐÏ
|
28
|
-
ÔÏÌØËÏ
|
29
|
-
ÅÅ
|
30
|
-
ÍÎÅ
|
31
|
-
ÂÙÌÏ
|
32
|
-
×ÏÔ
|
33
|
-
ÏÔ
|
34
|
-
ÍÅÎÑ
|
35
|
-
ÅÝÅ
|
36
|
-
ÎÅÔ
|
37
|
-
Ï
|
38
|
-
ÉÚ
|
39
|
-
ÅÍÕ
|
40
|
-
ÔÅÐÅÒØ
|
41
|
-
ËÏÇÄÁ
|
42
|
-
ÄÁÖÅ
|
43
|
-
ÎÕ
|
44
|
-
×ÄÒÕÇ
|
45
|
-
ÌÉ
|
46
|
-
ÅÓÌÉ
|
47
|
-
ÕÖÅ
|
48
|
-
ÉÌÉ
|
49
|
-
ÎÉ
|
50
|
-
ÂÙÔØ
|
51
|
-
ÂÙÌ
|
52
|
-
ÎÅÇÏ
|
53
|
-
ÄÏ
|
54
|
-
×ÁÓ
|
55
|
-
ÎÉÂÕÄØ
|
56
|
-
ÏÐÑÔØ
|
57
|
-
ÕÖ
|
58
|
-
×ÁÍ
|
59
|
-
ÓËÁÚÁÌ
|
60
|
-
×ÅÄØ
|
61
|
-
ÔÁÍ
|
62
|
-
ÐÏÔÏÍ
|
63
|
-
ÓÅÂÑ
|
64
|
-
ÎÉÞÅÇÏ
|
65
|
-
ÅÊ
|
66
|
-
ÍÏÖÅÔ
|
67
|
-
ÏÎÉ
|
68
|
-
ÔÕÔ
|
69
|
-
ÇÄÅ
|
70
|
-
ÅÓÔØ
|
71
|
-
ÎÁÄÏ
|
72
|
-
ÎÅÊ
|
73
|
-
ÄÌÑ
|
74
|
-
ÍÙ
|
75
|
-
ÔÅÂÑ
|
76
|
-
ÉÈ
|
77
|
-
ÞÅÍ
|
78
|
-
ÂÙÌÁ
|
79
|
-
ÓÁÍ
|
80
|
-
ÞÔÏÂ
|
81
|
-
ÂÅÚ
|
82
|
-
ÂÕÄÔÏ
|
83
|
-
ÞÅÌÏ×ÅË
|
84
|
-
ÞÅÇÏ
|
85
|
-
ÒÁÚ
|
86
|
-
ÔÏÖÅ
|
87
|
-
ÓÅÂÅ
|
88
|
-
ÐÏÄ
|
89
|
-
ÖÉÚÎØ
|
90
|
-
ÂÕÄÅÔ
|
91
|
-
Ö
|
92
|
-
ÔÏÇÄÁ
|
93
|
-
ËÔÏ
|
94
|
-
ÜÔÏÔ
|
95
|
-
ÇÏ×ÏÒÉÌ
|
96
|
-
ÔÏÇÏ
|
97
|
-
ÐÏÔÏÍÕ
|
98
|
-
ÜÔÏÇÏ
|
99
|
-
ËÁËÏÊ
|
100
|
-
ÓÏ×ÓÅÍ
|
101
|
-
ÎÉÍ
|
102
|
-
ÚÄÅÓØ
|
103
|
-
ÜÔÏÍ
|
104
|
-
ÏÄÉÎ
|
105
|
-
ÐÏÞÔÉ
|
106
|
-
ÍÏÊ
|
107
|
-
ÔÅÍ
|
108
|
-
ÞÔÏÂÙ
|
109
|
-
ÎÅÅ
|
110
|
-
ËÁÖÅÔÓÑ
|
111
|
-
ÓÅÊÞÁÓ
|
112
|
-
ÂÙÌÉ
|
113
|
-
ËÕÄÁ
|
114
|
-
ÚÁÞÅÍ
|
115
|
-
ÓËÁÚÁÔØ
|
116
|
-
×ÓÅÈ
|
117
|
-
ÎÉËÏÇÄÁ
|
118
|
-
ÓÅÇÏÄÎÑ
|
119
|
-
ÍÏÖÎÏ
|
120
|
-
ÐÒÉ
|
121
|
-
ÎÁËÏÎÅÃ
|
122
|
-
Ä×Á
|
123
|
-
ÏÂ
|
124
|
-
ÄÒÕÇÏÊ
|
125
|
-
ÈÏÔØ
|
126
|
-
ÐÏÓÌÅ
|
127
|
-
ÎÁÄ
|
128
|
-
ÂÏÌØÛÅ
|
129
|
-
ÔÏÔ
|
130
|
-
ÞÅÒÅÚ
|
131
|
-
ÜÔÉ
|
132
|
-
ÎÁÓ
|
133
|
-
ÐÒÏ
|
134
|
-
×ÓÅÇÏ
|
135
|
-
ÎÉÈ
|
136
|
-
ËÁËÁÑ
|
137
|
-
ÍÎÏÇÏ
|
138
|
-
ÒÁÚ×Å
|
139
|
-
ÓËÁÚÁÌÁ
|
140
|
-
ÔÒÉ
|
141
|
-
ÜÔÕ
|
142
|
-
ÍÏÑ
|
143
|
-
×ÐÒÏÞÅÍ
|
144
|
-
ÈÏÒÏÛÏ
|
145
|
-
Ó×ÏÀ
|
146
|
-
ÜÔÏÊ
|
147
|
-
ÐÅÒÅÄ
|
148
|
-
ÉÎÏÇÄÁ
|
149
|
-
ÌÕÞÛÅ
|
150
|
-
ÞÕÔØ
|
151
|
-
ÔÏÍ
|
152
|
-
ÎÅÌØÚÑ
|
153
|
-
ÔÁËÏÊ
|
154
|
-
ÉÍ
|
155
|
-
ÂÏÌÅÅ
|
156
|
-
×ÓÅÇÄÁ
|
157
|
-
ËÏÎÅÞÎÏ
|
158
|
-
×ÓÀ
|
159
|
-
ÍÅÖÄÕ
|
1
|
+
и,в,во,не,что,он,на,я,с,со,как,а,то,все,она,так,его,но,да,ты,к,у,же,вы,за,бы,по,только,ее,мне,было,вот,от,меня,еще,нет,о,из,ему,теперь,когда,даже,ну,вдруг,ли,если,уже,или,ни,быть,был,него,до,вас,нибудь,опять,уж,вам,сказал,ведь,там,потом,себя,ничего,ей,может,они,тут,где,есть,надо,ней,для,мы,тебя,их,чем,была,сам,чтоб,без,будто,человек,чего,раз,тоже,себе,под,жизнь,будет,ж,тогда,кто,этот,говорил,того,потому,этого,какой,совсем,ним,здесь,этом,один,почти,мой,тем,чтобы,нее,кажется,сейчас,были,куда,зачем,сказать,всех,никогда,сегодня,можно,при,наконец,два,об,другой,хоть,после,над,больше,тот,через,эти,нас,про,всего,них,какая,много,разве,сказала,три,эту,моя,впрочем,хорошо,свою,этой,перед,иногда,лучше,чуть,том,нельзя,такой,им,более,всегда,конечно,всю,между
|
@@ -1,114 +1 @@
|
|
1
|
-
och
|
2
|
-
det
|
3
|
-
att
|
4
|
-
i
|
5
|
-
en
|
6
|
-
jag
|
7
|
-
hon
|
8
|
-
som
|
9
|
-
han
|
10
|
-
på
|
11
|
-
den
|
12
|
-
med
|
13
|
-
var
|
14
|
-
sig
|
15
|
-
för
|
16
|
-
så
|
17
|
-
till
|
18
|
-
är
|
19
|
-
men
|
20
|
-
ett
|
21
|
-
om
|
22
|
-
hade
|
23
|
-
de
|
24
|
-
av
|
25
|
-
icke
|
26
|
-
mig
|
27
|
-
du
|
28
|
-
henne
|
29
|
-
då
|
30
|
-
sin
|
31
|
-
nu
|
32
|
-
har
|
33
|
-
inte
|
34
|
-
hans
|
35
|
-
honom
|
36
|
-
skulle
|
37
|
-
hennes
|
38
|
-
där
|
39
|
-
min
|
40
|
-
man
|
41
|
-
ej
|
42
|
-
vid
|
43
|
-
kunde
|
44
|
-
något
|
45
|
-
från
|
46
|
-
ut
|
47
|
-
när
|
48
|
-
efter
|
49
|
-
upp
|
50
|
-
vi
|
51
|
-
dem
|
52
|
-
vara
|
53
|
-
vad
|
54
|
-
över
|
55
|
-
än
|
56
|
-
dig
|
57
|
-
kan
|
58
|
-
sina
|
59
|
-
här
|
60
|
-
ha
|
61
|
-
mot
|
62
|
-
alla
|
63
|
-
under
|
64
|
-
någon
|
65
|
-
eller
|
66
|
-
allt
|
67
|
-
mycket
|
68
|
-
sedan
|
69
|
-
ju
|
70
|
-
denna
|
71
|
-
själv
|
72
|
-
detta
|
73
|
-
åt
|
74
|
-
utan
|
75
|
-
varit
|
76
|
-
hur
|
77
|
-
ingen
|
78
|
-
mitt
|
79
|
-
ni
|
80
|
-
bli
|
81
|
-
blev
|
82
|
-
oss
|
83
|
-
din
|
84
|
-
dessa
|
85
|
-
några
|
86
|
-
deras
|
87
|
-
blir
|
88
|
-
mina
|
89
|
-
samma
|
90
|
-
vilken
|
91
|
-
er
|
92
|
-
sådan
|
93
|
-
vår
|
94
|
-
blivit
|
95
|
-
dess
|
96
|
-
inom
|
97
|
-
mellan
|
98
|
-
sådant
|
99
|
-
varför
|
100
|
-
varje
|
101
|
-
vilka
|
102
|
-
ditt
|
103
|
-
vem
|
104
|
-
vilket
|
105
|
-
sitta
|
106
|
-
sådana
|
107
|
-
vart
|
108
|
-
dina
|
109
|
-
vars
|
110
|
-
vårt
|
111
|
-
våra
|
112
|
-
ert
|
113
|
-
era
|
114
|
-
vilkas
|
1
|
+
och,det,att,i,en,jag,hon,som,han,på,den,med,var,sig,för,så,till,är,men,ett,om,hade,de,av,icke,mig,du,henne,då,sin,nu,har,inte,hans,honom,skulle,hennes,där,min,man,ej,vid,kunde,något,från,ut,när,efter,upp,vi,dem,vara,vad,över,än,dig,kan,sina,här,ha,mot,alla,under,någon,eller,allt,mycket,sedan,ju,denna,själv,detta,åt,utan,varit,hur,ingen,mitt,ni,bli,blev,oss,din,dessa,några,deras,blir,mina,samma,vilken,er,sådan,vår,blivit,dess,inom,mellan,sådant,varför,varje,vilka,ditt,vem,vilket,sitta,sådana,vart,dina,vars,vårt,våra,ert,era,vilkas,
|
@@ -1,16 +1,20 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
1
|
+
module Stopwords
|
2
|
+
module Snowball
|
3
|
+
class Stopwords::Snowball::WordSieve
|
4
|
+
def initialize custom_list = []
|
5
|
+
@filters = Dir[File.dirname(__FILE__) + '/locales/*.csv'].each_with_object({}) do |file, filters|
|
6
|
+
lang = File.basename(file, '.csv').to_sym
|
7
|
+
filters[lang] = Stopwords::Snowball::Filter.new lang, custom_list
|
8
|
+
end
|
9
|
+
end
|
8
10
|
|
9
|
-
|
10
|
-
|
11
|
-
|
11
|
+
def stopword? args={}
|
12
|
+
args[:lang] ? @filters[args[:lang]].stopword?(args[:word] ) : false
|
13
|
+
end
|
12
14
|
|
13
|
-
|
14
|
-
|
15
|
+
def filter args={}
|
16
|
+
args[:lang] ? @filters[args[:lang]].filter(args[:words] ) : args[:words]
|
17
|
+
end
|
18
|
+
end
|
15
19
|
end
|
16
|
-
end
|
20
|
+
end
|
data/spec/lib/filter_spec.rb
CHANGED
@@ -9,9 +9,9 @@ describe Stopwords::Filter do
|
|
9
9
|
|
10
10
|
subject { filter }
|
11
11
|
|
12
|
-
it("should remove the stopwords for the list of words to be filtered") { filter.filter("desde Santurce a Bilbao".split).
|
12
|
+
it("should remove the stopwords for the list of words to be filtered") { expect(filter.filter("desde Santurce a Bilbao".split)).to eq ["Santurce", "Bilbao"]}
|
13
13
|
|
14
14
|
end
|
15
15
|
|
16
16
|
|
17
|
-
end
|
17
|
+
end
|
@@ -9,11 +9,32 @@ describe Stopwords::Snowball::Filter do
|
|
9
9
|
|
10
10
|
subject { filter }
|
11
11
|
|
12
|
-
|
12
|
+
it("should have the appropiate stopwords") { expect(subject.stopwords).to eq ["de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", "por", "un", "para", "con", "no", "una", "su", "al", "lo", "como", "más", "pero", "sus", "le", "ya", "o", "este", "sí", "porque", "esta", "entre", "cuando", "muy", "sin", "sobre", "también", "me", "hasta", "hay", "donde", "quien", "desde", "todo", "nos", "durante", "todos", "uno", "les", "ni", "contra", "otros", "ese", "eso", "ante", "ellos", "e", "esto", "mí", "antes", "algunos", "qué", "unos", "yo", "otro", "otras", "otra", "él", "tanto", "esa", "estos", "mucho", "quienes", "nada", "muchos", "cual", "poco", "ella", "estar", "estas", "algunas", "algo", "nosotros", "mi", "mis", "tú", "te", "ti", "tu", "tus", "ellas", "nosotras", "vosotros", "vosotras", "os", "mío", "mía", "míos", "mías", "tuyo", "tuya", "tuyos", "tuyas", "suyo", "suya", "suyos", "suyas", "nuestro", "nuestra", "nuestros", "nuestras", "vuestro", "vuestra", "vuestros", "vuestras", "esos", "esas", "estoy", "estás", "está", "estamos", "estáis", "están", "esté", "estés", "estemos", "estéis", "estén", "estaré", "estarás", "estará", "estaremos", "estaréis", "estarán", "estaría", "estarías", "estaríamos", "estaríais", "estarían", "estaba", "estabas", "estábamos", "estabais", "estaban", "estuve", "estuviste", "estuvo", "estuvimos", "estuvisteis", "estuvieron", "estuviera", "estuvieras", "estuviéramos", "estuvierais", "estuvieran", "estuviese", "estuvieses", "estuviésemos", "estuvieseis", "estuviesen", "estando", "estado", "estada", "estados", "estadas", "estad", "he", "has", "ha", "hemos", "habéis", "han", "haya", "hayas", "hayamos", "hayáis", "hayan", "habré", "habrás", "habrá", "habremos", "habréis", "habrán", "habría", "habrías", "habríamos", "habríais", "habrían", "había", "habías", "habíamos", "habíais", "habían", "hube", "hubiste", "hubo", "hubimos", "hubisteis", "hubieron", "hubiera", "hubieras", "hubiéramos", "hubierais", "hubieran", "hubiese", "hubieses", 
"hubiésemos", "hubieseis", "hubiesen", "habiendo", "habido", "habida", "habidos", "habidas", "soy", "eres", "es", "somos", "sois", "son", "sea", "seas", "seamos", "seáis", "sean", "seré", "serás", "será", "seremos", "seréis", "serán", "sería", "serías", "seríamos", "seríais", "serían", "era", "eras", "éramos", "erais", "eran", "fui", "fuiste", "fue", "fuimos", "fuisteis", "fueron", "fuera", "fueras", "fuéramos", "fuerais", "fueran", "fuese", "fueses", "fuésemos", "fueseis", "fuesen", "siendo", "sido", "tengo", "tienes", "tiene", "tenemos", "tenéis", "tienen", "tenga", "tengas", "tengamos", "tengáis", "tengan", "tendré", "tendrás", "tendrá", "tendremos", "tendréis", "tendrán", "tendría", "tendrías", "tendríamos", "tendríais", "tendrían", "tenía", "tenías", "teníamos", "teníais", "tenían", "tuve", "tuviste", "tuvo", "tuvimos", "tuvisteis", "tuvieron", "tuviera", "tuvieras", "tuviéramos", "tuvierais", "tuvieran", "tuviese", "tuvieses", "tuviésemos", "tuvieseis", "tuviesen", "teniendo", "tenido", "tenida", "tenidos", "tenidas", "tened"]}
|
13
13
|
|
14
|
-
it("should remove the stopwords for the list of words to be filtered") { filter.filter("desde Santurce a Bilbao".split).
|
14
|
+
it("should remove the stopwords for the list of words to be filtered") { expect(filter.filter("desde Santurce a Bilbao".split)).to eq ["Santurce", "Bilbao"]}
|
15
15
|
|
16
16
|
end
|
17
17
|
|
18
|
+
context "when custom list" do
|
18
19
|
|
19
|
-
|
20
|
+
let (:filter) { Stopwords::Snowball::Filter.new "es", ["Santurce"] }
|
21
|
+
|
22
|
+
subject { filter }
|
23
|
+
|
24
|
+
it("should remove the stopwords for the list of words to be filtered") { expect(filter.filter("desde Santurce a Bilbao".split)).to eq ["Bilbao"]}
|
25
|
+
|
26
|
+
end
|
27
|
+
|
28
|
+
context "when locale is not found" do
|
29
|
+
|
30
|
+
let (:filter) { Stopwords::Snowball::Filter.new "no-real" }
|
31
|
+
|
32
|
+
subject { filter }
|
33
|
+
|
34
|
+
it("should throw an error explaining the locale not found") { expect{filter}.to raise_error(ArgumentError)}
|
35
|
+
|
36
|
+
end
|
37
|
+
|
38
|
+
|
39
|
+
|
40
|
+
end
|
metadata
CHANGED
@@ -1,42 +1,39 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: stopwords-filter
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
version: 0.2.1
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.5.0
|
6
5
|
platform: ruby
|
7
|
-
authors:
|
6
|
+
authors:
|
8
7
|
- David J. Brenes
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
|
13
|
-
date: 2013-02-03 00:00:00 Z
|
11
|
+
date: 2021-06-09 00:00:00.000000000 Z
|
14
12
|
dependencies: []
|
15
|
-
|
16
|
-
|
13
|
+
description: Small library that allows you to create a simple stopwords filter or
|
14
|
+
use some based on Snowball stopwords lists
|
17
15
|
email: davidjbrenes@gmail.com
|
18
16
|
executables: []
|
19
|
-
|
20
17
|
extensions: []
|
21
|
-
|
22
|
-
extra_rdoc_files:
|
18
|
+
extra_rdoc_files:
|
23
19
|
- CHANGELOG
|
24
20
|
- LICENSE.txt
|
25
21
|
- README.md
|
26
|
-
files:
|
22
|
+
files:
|
27
23
|
- CHANGELOG
|
28
24
|
- Gemfile
|
29
25
|
- Gemfile.lock
|
26
|
+
- LICENSE.txt
|
30
27
|
- README.md
|
31
28
|
- VERSION
|
32
29
|
- lib/stopwords.rb
|
33
30
|
- lib/stopwords/filter.rb
|
34
31
|
- lib/stopwords/snowball.rb
|
35
32
|
- lib/stopwords/snowball/filter.rb
|
36
|
-
- lib/stopwords/snowball/wordsieve.rb
|
37
33
|
- lib/stopwords/snowball/locales/bg.csv
|
38
34
|
- lib/stopwords/snowball/locales/da.csv
|
39
35
|
- lib/stopwords/snowball/locales/de.csv
|
36
|
+
- lib/stopwords/snowball/locales/el.csv
|
40
37
|
- lib/stopwords/snowball/locales/en.csv
|
41
38
|
- lib/stopwords/snowball/locales/es.csv
|
42
39
|
- lib/stopwords/snowball/locales/fn.csv
|
@@ -44,39 +41,37 @@ files:
|
|
44
41
|
- lib/stopwords/snowball/locales/hu.csv
|
45
42
|
- lib/stopwords/snowball/locales/it.csv
|
46
43
|
- lib/stopwords/snowball/locales/nl.csv
|
44
|
+
- lib/stopwords/snowball/locales/pl.csv
|
47
45
|
- lib/stopwords/snowball/locales/pt.csv
|
46
|
+
- lib/stopwords/snowball/locales/ro.csv
|
48
47
|
- lib/stopwords/snowball/locales/ru.csv
|
49
48
|
- lib/stopwords/snowball/locales/sv.csv
|
49
|
+
- lib/stopwords/snowball/wordsieve.rb
|
50
50
|
- spec/lib/filter_spec.rb
|
51
51
|
- spec/lib/snowball_filter_spec.rb
|
52
52
|
- spec/spec_helper.rb
|
53
|
-
- LICENSE.txt
|
54
53
|
homepage: http://github.com/brenes/stopwords-filter
|
55
|
-
licenses:
|
54
|
+
licenses:
|
56
55
|
- MIT
|
56
|
+
metadata: {}
|
57
57
|
post_install_message:
|
58
58
|
rdoc_options: []
|
59
|
-
|
60
|
-
require_paths:
|
59
|
+
require_paths:
|
61
60
|
- lib
|
62
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
63
|
-
|
64
|
-
requirements:
|
61
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
62
|
+
requirements:
|
65
63
|
- - ">="
|
66
|
-
- !ruby/object:Gem::Version
|
67
|
-
version:
|
68
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
69
|
-
|
70
|
-
requirements:
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
66
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
71
68
|
- - ">="
|
72
|
-
- !ruby/object:Gem::Version
|
73
|
-
version:
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: '0'
|
74
71
|
requirements: []
|
75
|
-
|
76
72
|
rubyforge_project:
|
77
|
-
rubygems_version:
|
73
|
+
rubygems_version: 2.5.1
|
78
74
|
signing_key:
|
79
|
-
specification_version:
|
75
|
+
specification_version: 4
|
80
76
|
summary: Snowball based filters for stopwords
|
81
77
|
test_files: []
|
82
|
-
|