stopwords-filter 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +7 -0
- data/README.md +29 -3
- data/VERSION +1 -1
- data/lib/stopwords/filter.rb +5 -1
- data/lib/stopwords/snowball.rb +1 -0
- data/lib/stopwords/snowball/locales/bg.csv +1 -0
- data/lib/stopwords/snowball/wordsieve.rb +16 -0
- metadata +5 -1
data/CHANGELOG
ADDED
data/README.md
CHANGED
@@ -29,7 +29,12 @@ in your Gemfile.
|
|
29
29
|
```
|
30
30
|
stopwords = ['by', 'written', 'from']
|
31
31
|
filter = Stopwords::Filter.new stopwords
|
32
|
-
|
32
|
+
|
33
|
+
filter.filter 'guide by douglas adams'.split
|
34
|
+
# ['guide', 'douglas', 'adams']
|
35
|
+
|
36
|
+
filter.stopword? 'by'
|
37
|
+
# true
|
33
38
|
```
|
34
39
|
|
35
40
|
2. Snowball version
|
@@ -37,7 +42,23 @@ filter.filter 'guide by douglas adams'.split #-> ['guide', 'douglas', 'adams']
|
|
37
42
|
|
38
43
|
```
|
39
44
|
filter = Stopwords::Snowball::Filter.new "en"
|
40
|
-
filter.filter 'guide by douglas adams'.split
|
45
|
+
filter.filter 'guide by douglas adams'.split
|
46
|
+
# ['guide', 'douglas', 'adams']
|
47
|
+
|
48
|
+
filter.stopword? 'by'
|
49
|
+
# true
|
50
|
+
```
|
51
|
+
|
52
|
+
2.1 Snowball version with Sieve class (thanks to @s2gatev)
|
53
|
+
|
54
|
+
```ruby
|
55
|
+
sieve = Stopwords::Snowball::WordSieve.new
|
56
|
+
|
57
|
+
filtered = sieve.filter lang: :en, words: 'guide by douglas adams'.split
|
58
|
+
# filtered = ['guide', 'douglas', 'adams']
|
59
|
+
|
60
|
+
sieve.stopword? lang: :en, word: 'by'
|
61
|
+
# true
|
41
62
|
```
|
42
63
|
|
43
64
|
|
@@ -103,6 +124,10 @@ Anything else?
|
|
103
124
|
|
104
125
|
In a future version I would like to include a chaining filter where you include a series of operations and they are executed in a linear order, just like the [Pipes and Filters design pattern][wikipedia_pipes_filters]
|
105
126
|
|
127
|
+
Acknowledgments
|
128
|
+
----------------
|
129
|
+
|
130
|
+
Thanks to @s2gatev who added the `stopword?` method and the sieve class to this gem
|
106
131
|
|
107
132
|
[wikipedia_stopwords]: http://en.wikipedia.org/wiki/Stopword
|
108
133
|
[solr]: https://github.com/sunspot/sunspot
|
@@ -110,4 +135,5 @@ In a future version I would like to include a chaining filter where you include
|
|
110
135
|
[google]: https://github.com/alexreisner/google_custom_search
|
111
136
|
[postgre]: https://github.com/Casecommons/pg_search
|
112
137
|
[wikipedia_snowball]: http://en.wikipedia.org/wiki/Snowball_programming_language
|
113
|
-
[wikipedia_pipes_filters]: http://en.wikipedia.org/wiki/Pipes_and_filters
|
138
|
+
[wikipedia_pipes_filters]: http://en.wikipedia.org/wiki/Pipes_and_filters
|
139
|
+
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.2.0
|
data/lib/stopwords/filter.rb
CHANGED
data/lib/stopwords/snowball.rb
CHANGED
@@ -0,0 +1 @@
|
|
1
|
+
а,автентичен,аз,ако,ала,бе,без,беше,би,бивш,бивша,бившо,бил,била,били,било,благодаря,близо,бъдат,бъде,бяха,в,вас,ваш,ваша,вероятно,вече,взема,ви,вие,винаги,внимава,време,все,всеки,всички,всичко,всяка,във,въпреки,върху,г,ги,главен,главна,главно,глас,го,година,години,годишен,д,да,дали,два,двама,двамата,две,двете,ден,днес,дни,до,добра,добре,добро,добър,докато,докога,дори,досега,доста,друг,друга,други,е,евтин,едва,един,една,еднаква,еднакви,еднакъв,едно,екип,ето,живот,за,забавям,зад,заедно,заради,засега,заспал,затова,защо,защото,и,из,или,им,има,имат,иска,й,каза,как,каква,какво,както,какъв,като,кога,когато,което,които,кой,който,колко,която,къде,където,към,лесен,лесно,ли,лош,м,май,малко,ме,между,мек,мен,месец,ми,много,мнозина,мога,могат,може,мокър,моля,момента,му,н,на,над,назад,най,направи,напред,например,нас,не,него,нещо,нея,ни,ние,никой,нито,нищо,но,нов,нова,нови,новина,някои,някой,няколко,няма,обаче,около,освен,особено,от,отгоре,отново,още,пак,по,повече,повечето,под,поне,поради,после,почти,прави,пред,преди,през,при,пък,първата,първи,първо,пъти,равен,равна,с,са,сам,само,се,сега,си,син,скоро,след,следващ,сме,смях,според,сред,срещу,сте,съм,със,също,т,тази,така,такива,такъв,там,твой,те,тези,ти,т.н.,то,това,тогава,този,той,толкова,точно,три,трябва,тук,тъй,тя,тях,у,утре,харесва,хиляди,ч,часа,че,често,чрез,ще,щом,юмрук,я,як
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# Holds one Stopwords::Snowball::Filter per locale file so that the language
# can be chosen on each call instead of when the filter is built.
class Stopwords::Snowball::WordSieve
  # Builds a filter for every locales/*.csv file located next to this source
  # file. Each filter is stored under the file's base name as a Symbol
  # (for example locales/en.csv is stored under :en).
  def initialize
    locale_files = Dir[File.join(File.dirname(__FILE__), 'locales', '*.csv')]
    @filters = locale_files.each_with_object({}) do |file, filters|
      lang = File.basename(file, '.csv').to_sym
      filters[lang] = Stopwords::Snowball::Filter.new lang
    end
  end

  # Asks the filter registered for args[:lang] whether args[:word] is a
  # stopword.
  #
  # args - Hash with :lang (Symbol or String) and :word.
  #
  # Returns the filter's answer, or false when :lang is missing or no filter
  # is registered for it.
  def stopword? args={}
    filter = filter_for args[:lang]
    filter ? filter.stopword?(args[:word]) : false
  end

  # Runs args[:words] through the filter registered for args[:lang].
  #
  # args - Hash with :lang (Symbol or String) and :words.
  #
  # Returns the filter's result, or args[:words] unchanged when :lang is
  # missing or no filter is registered for it.
  def filter args={}
    filter = filter_for args[:lang]
    filter ? filter.filter(args[:words]) : args[:words]
  end

  private

  # Looks up the filter for lang. Strings are converted to Symbols because
  # @filters is keyed by Symbol. Returns nil for nil/false or unknown
  # languages so callers can fall back instead of calling a method on nil.
  def filter_for lang
    return nil unless lang.respond_to?(:to_sym)
    @filters[lang.to_sym]
  end
end
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: stopwords-filter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.1.0
|
5
|
+
version: 0.2.0
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- David J. Brenes
|
@@ -20,9 +20,11 @@ executables: []
|
|
20
20
|
extensions: []
|
21
21
|
|
22
22
|
extra_rdoc_files:
|
23
|
+
- CHANGELOG
|
23
24
|
- LICENSE.txt
|
24
25
|
- README.md
|
25
26
|
files:
|
27
|
+
- CHANGELOG
|
26
28
|
- Gemfile
|
27
29
|
- Gemfile.lock
|
28
30
|
- README.md
|
@@ -31,6 +33,8 @@ files:
|
|
31
33
|
- lib/stopwords/filter.rb
|
32
34
|
- lib/stopwords/snowball.rb
|
33
35
|
- lib/stopwords/snowball/filter.rb
|
36
|
+
- lib/stopwords/snowball/wordsieve.rb
|
37
|
+
- lib/stopwords/snowball/locales/bg.csv
|
34
38
|
- lib/stopwords/snowball/locales/da.csv
|
35
39
|
- lib/stopwords/snowball/locales/de.csv
|
36
40
|
- lib/stopwords/snowball/locales/en.csv
|