stopwords-filter 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +7 -0
- data/README.md +29 -3
- data/VERSION +1 -1
- data/lib/stopwords/filter.rb +5 -1
- data/lib/stopwords/snowball.rb +1 -0
- data/lib/stopwords/snowball/locales/bg.csv +1 -0
- data/lib/stopwords/snowball/wordsieve.rb +16 -0
- metadata +5 -1
data/CHANGELOG
ADDED
data/README.md
CHANGED
@@ -29,7 +29,12 @@ in your Gemfile.
|
|
29
29
|
```
|
30
30
|
stopwords = ['by', 'written', 'from']
|
31
31
|
filter = Stopwords::Filter.new stopwords
|
32
|
-
|
32
|
+
|
33
|
+
filter.filter 'guide by douglas adams'.split
|
34
|
+
# ['guide', 'douglas', 'adams']
|
35
|
+
|
36
|
+
filter.stopword? 'by'
|
37
|
+
# true
|
33
38
|
```
|
34
39
|
|
35
40
|
2. Snowball version
|
@@ -37,7 +42,23 @@ filter.filter 'guide by douglas adams'.split #-> ['guide', 'douglas', 'adams']
|
|
37
42
|
|
38
43
|
```
|
39
44
|
filter = Stopwords::Snowball::Filter.new "en"
|
40
|
-
filter.filter 'guide by douglas adams'.split
|
45
|
+
filter.filter 'guide by douglas adams'.split
|
46
|
+
# ['guide', 'douglas', 'adams']
|
47
|
+
|
48
|
+
filter.stopword? 'by'
|
49
|
+
# true
|
50
|
+
```
|
51
|
+
|
52
|
+
2.1 Snowball version with Sieve class (thanks to @s2gatev)
|
53
|
+
|
54
|
+
```ruby
|
55
|
+
sieve = Stopwords::Snowball::WordSieve.new
|
56
|
+
|
57
|
+
filtered = sieve.filter lang: :en, words: 'guide by douglas adams'.split
|
58
|
+
# filtered = ['guide', 'douglas', 'adams']
|
59
|
+
|
60
|
+
sieve.stopword? lang: :en, word: 'by'
|
61
|
+
# true
|
41
62
|
```
|
42
63
|
|
43
64
|
|
@@ -103,6 +124,10 @@ Anything else?
|
|
103
124
|
|
104
125
|
In a future version I would like to include a chaining filter, where you supply a series of operations and they are executed in linear order, just like the [Pipes and Filters design pattern][wikipedia_pipes_filters].
|
105
126
|
|
127
|
+
Acknowledgments
|
128
|
+
----------------
|
129
|
+
|
130
|
+
Thanks to @s2gatev who added the `stopword?` method and the sieve class to this gem
|
106
131
|
|
107
132
|
[wikipedia_stopwords]: http://en.wikipedia.org/wiki/Stopword
|
108
133
|
[solr]: https://github.com/sunspot/sunspot
|
@@ -110,4 +135,5 @@ In a future version I would like to include a chaining filter where you include
|
|
110
135
|
[google]: https://github.com/alexreisner/google_custom_search
|
111
136
|
[postgre]: https://github.com/Casecommons/pg_search
|
112
137
|
[wikipedia_snowball]: http://en.wikipedia.org/wiki/Snowball_programming_language
|
113
|
-
[wikipedia_pipes_filters]: http://en.wikipedia.org/wiki/Pipes_and_filters
|
138
|
+
[wikipedia_pipes_filters]: http://en.wikipedia.org/wiki/Pipes_and_filters
|
139
|
+
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.2.0
|
data/lib/stopwords/filter.rb
CHANGED
data/lib/stopwords/snowball.rb
CHANGED
@@ -0,0 +1 @@
|
|
1
|
+
а,автентичен,аз,ако,ала,бе,без,беше,би,бивш,бивша,бившо,бил,била,били,било,благодаря,близо,бъдат,бъде,бяха,в,вас,ваш,ваша,вероятно,вече,взема,ви,вие,винаги,внимава,време,все,всеки,всички,всичко,всяка,във,въпреки,върху,г,ги,главен,главна,главно,глас,го,година,години,годишен,д,да,дали,два,двама,двамата,две,двете,ден,днес,дни,до,добра,добре,добро,добър,докато,докога,дори,досега,доста,друг,друга,други,е,евтин,едва,един,една,еднаква,еднакви,еднакъв,едно,екип,ето,живот,за,забавям,зад,заедно,заради,засега,заспал,затова,защо,защото,и,из,или,им,има,имат,иска,й,каза,как,каква,какво,както,какъв,като,кога,когато,което,които,кой,който,колко,която,къде,където,към,лесен,лесно,ли,лош,м,май,малко,ме,между,мек,мен,месец,ми,много,мнозина,мога,могат,може,мокър,моля,момента,му,н,на,над,назад,най,направи,напред,например,нас,не,него,нещо,нея,ни,ние,никой,нито,нищо,но,нов,нова,нови,новина,някои,някой,няколко,няма,обаче,около,освен,особено,от,отгоре,отново,още,пак,по,повече,повечето,под,поне,поради,после,почти,прави,пред,преди,през,при,пък,първата,първи,първо,пъти,равен,равна,с,са,сам,само,се,сега,си,син,скоро,след,следващ,сме,смях,според,сред,срещу,сте,съм,със,също,т,тази,така,такива,такъв,там,твой,те,тези,ти,т.н.,то,това,тогава,този,той,толкова,точно,три,трябва,тук,тъй,тя,тях,у,утре,харесва,хиляди,ч,часа,че,често,чрез,ще,щом,юмрук,я,як
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# Multi-language front end for the Snowball stopword filters.
#
# On construction it builds one Stopwords::Snowball::Filter per locale CSV
# shipped next to this file, then dispatches each call to the filter of the
# requested language.
#
# Usage:
#   sieve = Stopwords::Snowball::WordSieve.new
#   sieve.filter lang: :en, words: 'guide by douglas adams'.split
#   sieve.stopword? lang: :en, word: 'by'
class Stopwords::Snowball::WordSieve
  # Creates a filter for every `locales/<lang>.csv` file, keyed by the
  # language code as a Symbol (the CSV base name, e.g. :en, :bg).
  def initialize
    locale_files = Dir[File.join(File.dirname(__FILE__), 'locales', '*.csv')]
    @filters = locale_files.each_with_object({}) do |file, filters|
      lang = File.basename(file, '.csv').to_sym
      filters[lang] = Stopwords::Snowball::Filter.new lang
    end
  end

  # Tells whether a word is a stopword in the given language.
  #
  # args - options Hash:
  #        :lang - language code (Symbol or String), e.g. :en
  #        :word - the word to check
  #
  # Returns the language filter's `stopword?` answer, or false when :lang is
  # missing or no locale file exists for it.
  def stopword? args={}
    language_filter = filter_for args[:lang]
    language_filter ? language_filter.stopword?(args[:word]) : false
  end

  # Runs the given words through the filter of the given language.
  #
  # args - options Hash:
  #        :lang  - language code (Symbol or String), e.g. :en
  #        :words - the collection of words to filter
  #
  # Returns the language filter's `filter` result, or args[:words] untouched
  # when :lang is missing or no locale file exists for it.
  def filter args={}
    language_filter = filter_for args[:lang]
    language_filter ? language_filter.filter(args[:words]) : args[:words]
  end

  private

  # Looks up the filter registered for +lang+. Filters are keyed by Symbol,
  # so String codes are normalized first. Returns nil for nil/false or for a
  # language with no registered filter, so callers can fall back instead of
  # calling a method on nil.
  def filter_for lang
    lang.respond_to?(:to_sym) ? @filters[lang.to_sym] : nil
  end
end
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: stopwords-filter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.1.0
|
5
|
+
version: 0.2.0
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- David J. Brenes
|
@@ -20,9 +20,11 @@ executables: []
|
|
20
20
|
extensions: []
|
21
21
|
|
22
22
|
extra_rdoc_files:
|
23
|
+
- CHANGELOG
|
23
24
|
- LICENSE.txt
|
24
25
|
- README.md
|
25
26
|
files:
|
27
|
+
- CHANGELOG
|
26
28
|
- Gemfile
|
27
29
|
- Gemfile.lock
|
28
30
|
- README.md
|
@@ -31,6 +33,8 @@ files:
|
|
31
33
|
- lib/stopwords/filter.rb
|
32
34
|
- lib/stopwords/snowball.rb
|
33
35
|
- lib/stopwords/snowball/filter.rb
|
36
|
+
- lib/stopwords/snowball/wordsieve.rb
|
37
|
+
- lib/stopwords/snowball/locales/bg.csv
|
34
38
|
- lib/stopwords/snowball/locales/da.csv
|
35
39
|
- lib/stopwords/snowball/locales/de.csv
|
36
40
|
- lib/stopwords/snowball/locales/en.csv
|