brstemmer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: f24a3b211508cbc82cb877169e00101e9afbd586
4
+ data.tar.gz: 1d0a21ee10cc280baea651abd8e260f28e10c165
5
+ SHA512:
6
+ metadata.gz: 45f02c3d9b46eff460ed06a4748e0588661df4a0f67fc38a5d5456590335d9360c31edec5dd5f597728bd3beaed40b5ea183e9ebe0f511b3bb69e1cf566f6558
7
+ data.tar.gz: 19264e5b908a282807ceeaea9457238b4856aefd0ad061603a71ceff6e10e8de3c9d5c972abe695e34358d4a6e1a7286c4a6d183b904a2e11947d7e269b110a1
data/.gitignore ADDED
@@ -0,0 +1,11 @@
1
+ .byebug_history
2
+ .idea
3
+ /.bundle/
4
+ /.yardoc
5
+ /Gemfile.lock
6
+ /_yardoc/
7
+ /coverage/
8
+ /doc/
9
+ /pkg/
10
+ /spec/reports/
11
+ /tmp/
@@ -0,0 +1,49 @@
1
+ # Contributor Code of Conduct
2
+
3
+ As contributors and maintainers of this project, and in the interest of
4
+ fostering an open and welcoming community, we pledge to respect all people who
5
+ contribute through reporting issues, posting feature requests, updating
6
+ documentation, submitting pull requests or patches, and other activities.
7
+
8
+ We are committed to making participation in this project a harassment-free
9
+ experience for everyone, regardless of level of experience, gender, gender
10
+ identity and expression, sexual orientation, disability, personal appearance,
11
+ body size, race, ethnicity, age, religion, or nationality.
12
+
13
+ Examples of unacceptable behavior by participants include:
14
+
15
+ * The use of sexualized language or imagery
16
+ * Personal attacks
17
+ * Trolling or insulting/derogatory comments
18
+ * Public or private harassment
19
+ * Publishing others' private information, such as physical or electronic
20
+ addresses, without explicit permission
21
+ * Other unethical or unprofessional conduct
22
+
23
+ Project maintainers have the right and responsibility to remove, edit, or
24
+ reject comments, commits, code, wiki edits, issues, and other contributions
25
+ that are not aligned to this Code of Conduct, or to ban temporarily or
26
+ permanently any contributor for other behaviors that they deem inappropriate,
27
+ threatening, offensive, or harmful.
28
+
29
+ By adopting this Code of Conduct, project maintainers commit themselves to
30
+ fairly and consistently applying these principles to every aspect of managing
31
+ this project. Project maintainers who do not follow or enforce the Code of
32
+ Conduct may be permanently removed from the project team.
33
+
34
+ This code of conduct applies both within project spaces and in public spaces
35
+ when an individual is representing the project or its community.
36
+
37
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
38
+ reported by contacting a project maintainer at renatocassino@gmail.com. All
39
+ complaints will be reviewed and investigated and will result in a response that
40
+ is deemed necessary and appropriate to the circumstances. Maintainers are
41
+ obligated to maintain confidentiality with regard to the reporter of an
42
+ incident.
43
+
44
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
45
+ version 1.3.0, available at
46
+ [http://contributor-covenant.org/version/1/3/0/][version]
47
+
48
+ [homepage]: http://contributor-covenant.org
49
+ [version]: http://contributor-covenant.org/version/1/3/0/
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in brstemmer.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 Renato Cassino
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,37 @@
1
+ # Brstemmer
2
+
3
+ A Ruby implementation of the RSLP stemming algorithm for the Portuguese language.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'brstemmer'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install brstemmer
20
+
21
+ ## Usage
22
+
23
+ ```ruby
24
+ puts "Correndo".brstemmer
25
+
26
+ # corr (the input is downcased before stemming)
27
+ ```
28
+
29
+ ## Contributing
30
+
31
+ Bug reports and pull requests are welcome on GitHub at https://github.com/tacnoman/brstemmer. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
32
+
33
+
34
+ ## License
35
+
36
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
37
+
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
require "bundler/gem_tasks"

# Only bundler/gem_tasks is loaded here and nothing in this Rakefile defines a
# :spec task, so a default task depending on :spec aborts with
# "Don't know how to build task 'spec'". Point the default at :build, which
# bundler/gem_tasks provides, so a bare `rake` does something useful.
task :default => :build
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "brstemmer"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
Binary file
data/brstemmer.gemspec ADDED
@@ -0,0 +1,24 @@
1
# coding: utf-8
# Gem specification for brstemmer. The version constant is loaded from
# lib/brstemmer/version.rb, so lib/ is put on the load path first.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'brstemmer/version'

Gem::Specification.new do |spec|
  spec.name          = "brstemmer"
  spec.version       = Brstemmer::VERSION
  spec.authors       = ["Tacnoman"]
  spec.email         = ["renatocassino@gmail.com"]

  spec.summary       = %q{Helper to make a stemmer algorithm in pt-BR.}
  spec.description   = %q{Stemmer algorithm in pt-BR}
  spec.homepage      = "https://github.com/tacnoman/brstemmer.git"
  spec.license       = "MIT"

  # Package every git-tracked file except test directories and previously
  # built gem archives. Without the ".gem" filter a tracked
  # brstemmer-x.y.z.gem ends up inside the next packaged gem.
  spec.files         = `git ls-files -z`.split("\x0").reject do |f|
    f.match(%r{^(test|spec|features)/}) || File.extname(f) == ".gem"
  end
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.11"
  spec.add_development_dependency "rake", "~> 10.0"
end
@@ -0,0 +1,3 @@
1
# Namespace of the gem; this file only carries the release number.
module Brstemmer
  # Version string read by brstemmer.gemspec.
  VERSION = "0.1.0"
end
data/lib/brstemmer.rb ADDED
@@ -0,0 +1,578 @@
1
+ # encoding: UTF-8
2
+
3
+ =begin
4
+ @todo Reorganize the code with methods rslpLoaderStemmer, rslpProcessWord and rslpUnloadStemmer
5
+ Url with rules: http://alvinalexander.com/java/jwarehouse/lucene/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/pt/portuguese.rslp.shtml
6
+ Algorithm explained in portuguese: http://www.lume.ufrgs.br/bitstream/handle/10183/23576/000597277.pdf?sequence=1
7
+ Utils: http://www.inf.ufrgs.br/~viviane/rslp/
8
+ =end
9
+
10
+ require "brstemmer/version"
11
+
12
+ module Brstemmer
13
+ # Steps file for the RSLP stemmer.
14
+ # Step 1: Plural Reduction
15
+ RULES = [
16
+ {
17
+ :properties => { name: "plural_reduction", size:3, exceptions:1},
18
+ :rules => [
19
+ # bons -> bom
20
+ ["ns",1,"m"],
21
+ # balões -> balão
22
+ ["ões",3,"ão"],
23
+ # capitães -> capitão
24
+ ["ães",1,"ão",["mães"]],
25
+ # normais -> normal
26
+ ["ais",1,"al",["cais","mais"]],
27
+ # papéis -> papel
28
+ ["éis",2,"el"],
29
+ # amáveis -> amável
30
+ ["eis",2,"el"],
31
+ # lençóis -> lençol
32
+ ["óis",2,"ol"],
33
+ # barris -> barril
34
+ ["is",2,"il",["lápis","cais","mais","crúcis","biquínis","pois","depois","dois","leis"]],
35
+ # males -> mal
36
+ ["les",3,"l"],
37
+ # mares -> mar
38
+ ["res",3,"r", ["árvores"]],
39
+ # casas -> casa
40
+ ["s",2,"",["aliás","pires","lápis","cais","mais","mas","menos","férias","fezes","pêsames","crúcis","gás","atrás","moisés","através","convés","ês","país","após","ambas","ambos","messias", "depois"]]
41
+ ]
42
+ },
43
+
44
+ # Step 2: Adverb Reduction
45
+ {
46
+ :properties => { name:"adverb_reduction", size:0, exceptions:0 },
47
+ :rules => [
48
+ # felizmente -> feliz
49
+ ["mente",4,"",["experimente"]]
50
+ ]
51
+ },
52
+
53
+ # Step 3: Feminine Reduction
54
+ {
55
+ :properties => { name:"feminine_reduction", size:3, exceptions:1 },
56
+ :rules => [
57
+ # chefona -> chefão
58
+ ["ona",3,"ão",["abandona","lona","iona","cortisona","monótona","maratona","acetona","detona","carona"]],
59
+ # vilã -> vilão
60
+ ["ã",2,"ão",["amanhã","arapuã","fã","divã"]],
61
+ # professora -> professor
62
+ ["ora",3,"or"],
63
+ # americana -> americano
64
+ ["na",4,"no",["carona","abandona","lona","iona","cortisona","monótona","maratona","acetona","detona","guiana","campana","grana","caravana","banana","paisana"]],
65
+ # sozinha -> sozinho
66
+ ["inha",3,"inho",["rainha","linha","minha"]],
67
+ # inglesa -> inglês
68
+ ["esa",3,"ês",["mesa","obesa","princesa","turquesa","ilesa","pesa","presa"]],
69
+ # famosa -> famoso
70
+ ["osa",3,"oso",["mucosa","prosa"]],
71
+ # maníaca -> maníaco
72
+ ["íaca",3,"íaco"],
73
+ # prática -> prático
74
+ ["ica",3,"ico",["dica"]],
75
+ # cansada -> cansado
76
+ ["ada",2,"ado",["pitada"]],
77
+ # mantida -> mantido
78
+ ["ida",3,"ido",["vida","dúvida"]],
79
+ ["ída",3,"ido",["recaída","saída"]],
80
+ # prima -> primo
81
+ ["ima",3,"imo",["vítima"]],
82
+ # passiva -> passivo
83
+ ["iva",3,"ivo",["saliva","oliva"]],
84
+ # primeira -> primeiro
85
+ ["eira",3,"eiro",["beira","cadeira","frigideira","bandeira","feira","capoeira","barreira","fronteira","besteira","poeira"]]
86
+ ]
87
+ },
88
+
89
+ # Step 4: Augmentative/Diminutive Reduction
90
+ {
91
+ :properties => { name:"augmentative_reduction", size:0, exceptions:1 },
92
+ :rules => [
93
+ # cansadíssimo -> cansad
94
+ ["díssimo",5],
95
+ # amabilíssimo -> ama
96
+ ["abilíssimo",5],
97
+ # fortíssimo -> fort
98
+ ["íssimo",3],
99
+ ["ésimo",3],
100
+ # chiquérrimo -> chiqu
101
+ ["érrimo",4],
102
+ # pezinho -> pe
103
+ ["zinho",2],
104
+ # maluquinho -> maluc
105
+ ["quinho",4,"c"],
106
+ # amiguinho -> amig
107
+ ["uinho",4],
108
+ # cansadinho -> cansad
109
+ ["adinho",3],
110
+ # carrinho -> carr
111
+ ["inho",3,"",["caminho","cominho"]],
112
+ # grandalhão -> grand
113
+ ["alhão",4],
114
+ # dentuça -> dent
115
+ ["uça",4],
116
+ # ricaço -> ric
117
+ ["aço",4,"",["antebraço"]],
118
+ ["aça",4],
119
+ # cansadão -> cans
120
+ ["adão",4],
121
+ ["idão",4],
122
+ # corpázio -> corp
123
+ ["ázio",3,"",["topázio"]],
124
+ # pratarraz -> prat
125
+ ["arraz",4],
126
+ ["zarrão",3],
127
+ ["arrão",4],
128
+ # bocarra -> boc
129
+ ["arra",3],
130
+ # calorzão -> calor
131
+ ["zão",2,"",["coalizão"]],
132
+ # meninão -> menin
133
+ ["ão",3,"",["camarão","chimarrão","canção","coração","embrião","grotão","glutão","ficção","fogão","feição","furacão","gamão","lampião","leão","macacão","nação","órfão","orgão","patrão","portão","quinhão","rincão","tração","falcão","espião","mamão","folião","cordão","aptidão","campeão","colchão","limão","leilão","melão","barão","milhão","bilhão","fusão","cristão","ilusão","capitão","estação","senão"]]
134
+ ]
135
+ },
136
+
137
+ # Step 5: Noun Suffix Reduction
138
+ {
139
+ :properties => { name:"noun_reduction", size:0, exceptions:0 },
140
+ :rules => [
141
+ # existencialista -> exist
142
+ ["encialista",4],
143
+ # minimalista -> minim
144
+ ["alista",5],
145
+ # contagem -> cont
146
+ ["agem",3,"",["coragem","chantagem","vantagem","carruagem"]],
147
+ # gerenciamento -> gerenc
148
+ ["iamento",4],
149
+ # monitoramento -> monitor
150
+ ["amento",3,"",["firmamento","fundamento","departamento"]],
151
+ # nascimento -> nasc
152
+ ["imento",3],
153
+ ["mento",6,"",["firmamento","elemento","complemento","instrumento","departamento"]],
154
+ # comercializado -> comerci
155
+ ["alizado",4],
156
+ # traumatizado -> traum
157
+ ["atizado",4],
158
+ ["tizado",4,"",["alfabetizado"]],
159
+ # alfabetizado -> alfabet
160
+ ["izado",5,"",["organizado","pulverizado"]],
161
+ # associativo -> associ
162
+ ["ativo",4,"",["pejorativo","relativo"]],
163
+ # contraceptivo -> contracep
164
+ ["tivo",4,"",["relativo"]],
165
+ # esportivo -> esport
166
+ ["ivo",4,"",["passivo","possessivo","pejorativo","positivo"]],
167
+ # abalado -> abal
168
+ ["ado",2,"",["grado"]],
169
+ # impedido -> imped
170
+ ["ido",3,"",["cândido","consolido","rápido","decido","tímido","duvido","marido"]],
171
+ # ralador -> ral
172
+ ["ador",3],
173
+ # entendedor -> entend
174
+ ["edor",3],
175
+ # cumpridor -> cumpr
176
+ ["idor",4,"",["ouvidor"]],
177
+ ["dor",4,"",["ouvidor"]],
178
+ ["sor",4,"",["assessor"]],
179
+ ["atoria",5],
180
+ ["tor",3,"",["benfeitor","leitor","editor","pastor","produtor","promotor","consultor"]],
181
+ ["or",2,"",["motor","melhor","redor","rigor","sensor","tambor","tumor","assessor","benfeitor","pastor","terior","favor","autor"]],
182
+ # comparabilidade -> compar
183
+ ["abilidade",5],
184
+ # abolicionista -> abol
185
+ ["icionista",4],
186
+ # intervencionista -> interven
187
+ ["cionista",5],
188
+ ["ionista",5],
189
+ ["ionar",5],
190
+ # profissional -> profiss
191
+ ["ional",4],
192
+ # referência -> refer
193
+ ["ência",3],
194
+ # repugnância -> repugn
195
+ ["ância",4,"",["ambulância"]],
196
+ # abatedouro -> abat
197
+ ["edouro",3],
198
+ # fofoqueiro -> fofoc
199
+ ["queiro",3,"c"],
200
+ ["adeiro",4,"",["desfiladeiro"]],
201
+ # brasileiro -> brasil
202
+ ["eiro",3,"",["desfiladeiro","pioneiro","mosteiro"]],
203
+ ["uoso",3],
204
+ # gostoso -> gost
205
+ ["oso",3,"",["precioso"]],
206
+ # comercializaç -> comerci
207
+ ["alizaç",5],
208
+ ["atizaç",5],
209
+ ["tizaç",5],
210
+ ["izaç",5,"",["organizaç"]],
211
+ # alegaç -> aleg
212
+ ["aç",3,"",["equaç","relaç"]],
213
+ # aboliç -> abol
214
+ ["iç",3,"",["eleiç"]],
215
+ # anedotário -> anedot
216
+ ["ário",3,"",["voluntário","salário","aniversário","diário","lionário","armário"]],
217
+ ["atório",3],
218
+ ["rio",5,"",["voluntário","salário","aniversário","diário","compulsório","lionário","próprio","stério","armário"]],
219
+ # ministério -> minist
220
+ ["ério",6],
221
+ # chinês -> chin
222
+ ["ês",4],
223
+ # beleza -> bel
224
+ ["eza",3],
225
+ # rigidez -> rigid
226
+ ["ez",4],
227
+ # parentesco -> parent
228
+ ["esco",4],
229
+ # ocupante -> ocup
230
+ ["ante",2,"",["gigante","elefante","adiante","possante","instante","restaurante"]],
231
+ # bombástico -> bomb
232
+ ["ástico",4,"",["eclesiástico"]],
233
+ ["alístico",3],
234
+ ["áutico",4],
235
+ ["êutico",4],
236
+ ["tico",3,"",["político","eclesiástico","diagnostico","prático","doméstico","diagnóstico","idêntico","alopático","artístico","autêntico","eclético","crítico","critico"]],
237
+ # polêmico -> polêm
238
+ ["ico",4,"",["tico","público","explico"]],
239
+ # produtividade -> produt
240
+ ["ividade",5],
241
+ # profundidade -> profund
242
+ ["idade",4,"",["autoridade","comunidade"]],
243
+ # aposentadoria -> aposentad
244
+ ["oria",4,"",["categoria"]],
245
+ # existencial -> exist
246
+ ["encial",5],
247
+ # artista -> art
248
+ ["ista",4],
249
+ ["auta",5],
250
+ # maluquice -> maluc
251
+ ["quice",4,"c"],
252
+ # chatice -> chat
253
+ ["ice",4,"",["cúmplice"]],
254
+ # demoníaco -> demon
255
+ ["íaco",3],
256
+ # decorrente -> decorr
257
+ ["ente",4,"",["freqüente","alimente","acrescente","permanente","oriente","aparente"]],
258
+ ["ense",5],
259
+ # criminal -> crim
260
+ ["inal",3],
261
+ # americano -> americ
262
+ ["ano",4],
263
+ # amável -> am
264
+ ["ável",2,"",["afável","razoável","potável","vulnerável"]],
265
+ # combustível -> combust
266
+ ["ível",3,"",["possível"]],
267
+ ["vel",5,"",["possível","vulnerável","solúvel"]],
268
+ ["bil",3,"vel"],
269
+ # cobertura -> cobert
270
+ ["ura",4,"",["imatura","acupuntura","costura"]],
271
+ ["ural",4],
272
+ # consensual -> consens
273
+ ["ual",3,"",["bissexual","virtual","visual","pontual"]],
274
+ # mundial -> mund
275
+ ["ial",3],
276
+ # experimental -> experiment
277
+ ["al",4,"",["afinal","animal","estatal","bissexual","desleal","fiscal","formal","pessoal","liberal","postal","virtual","visual","pontual","sideral","sucursal"]],
278
+ ["alismo",4],
279
+ ["ivismo",4],
280
+ ["ismo",3,"",["cinismo"]]
281
+ ]
282
+ },
283
+
284
+ # Step 6: Verb Suffix Reduction
285
+ {
286
+ :properties => { name:"verb_reduction", size:0, exceptions:0 },
287
+ :rules => [
288
+ # cantaríamo -> cant
289
+ ["aríamo",2],
290
+ # cantássemo -> cant
291
+ ["ássemo",2],
292
+ # beberíamo -> beb
293
+ ["eríamo",2],
294
+ # bebêssemo -> beb
295
+ ["êssemo",2],
296
+ # partiríamo -> part
297
+ ["iríamo",3],
298
+ # partíssemo -> part
299
+ ["íssemo",3],
300
+ # cantáramo -> cant
301
+ ["áramo",2],
302
+ # cantárei -> cant
303
+ ["árei",2],
304
+ # cantaremo -> cant
305
+ ["aremo",2],
306
+ # cantariam -> cant
307
+ ["ariam",2],
308
+ # cantaríei -> cant
309
+ ["aríei",2],
310
+ # cantássei -> cant
311
+ ["ássei",2],
312
+ # cantassem -> cant
313
+ ["assem",2],
314
+ # cantávamo -> cant
315
+ ["ávamo",2],
316
+ # bebêramo -> beb
317
+ ["êramo",3],
318
+ # beberemo -> beb
319
+ ["eremo",3],
320
+ # beberiam -> beb
321
+ ["eriam",3],
322
+ # beberíei -> beb
323
+ ["eríei",3],
324
+ # bebêssei -> beb
325
+ ["êssei",3],
326
+ # bebessem -> beb
327
+ ["essem",3],
328
+ # partíramo -> part
329
+ ["íramo",3],
330
+ # partiremo -> part
331
+ ["iremo",3],
332
+ # partiriam -> part
333
+ ["iriam",3],
334
+ # partiríei -> part
335
+ ["iríei",3],
336
+ # partíssei -> part
337
+ ["íssei",3],
338
+ # partissem -> part
339
+ ["issem",3],
340
+ # cantando -> cant
341
+ ["ando",2],
342
+ # bebendo -> beb
343
+ ["endo",3],
344
+ # partindo -> part
345
+ ["indo",3],
346
+ # propondo -> prop
347
+ ["ondo",3],
348
+ # cantaram -> cant
349
+ ["aram",2],
350
+ ["arão",2],
351
+ # cantarde -> cant
352
+ ["arde",2],
353
+ # cantarei -> cant
354
+ ["arei",2],
355
+ # cantarem -> cant
356
+ ["arem",2],
357
+ # cantaria -> cant
358
+ ["aria",2],
359
+ # cantarmo -> cant
360
+ ["armo",2],
361
+ # cantasse -> cant
362
+ ["asse",2],
363
+ # cantaste -> cant
364
+ ["aste",2],
365
+ # cantavam -> cant
366
+ ["avam",2,"",["agravam"]],
367
+ # cantávei -> cant
368
+ ["ávei",2],
369
+ # beberam -> beb
370
+ ["eram",3],
371
+ ["erão",3],
372
+ # beberde -> beb
373
+ ["erde",3],
374
+ # beberei -> beb
375
+ ["erei",3],
376
+ # bebêrei -> beb
377
+ ["êrei",3],
378
+ # beberem -> beb
379
+ ["erem",3],
380
+ # beberia -> beb
381
+ ["eria",3],
382
+ # bebermo -> beb
383
+ ["ermo",3],
384
+ # bebesse -> beb
385
+ ["esse",3],
386
+ # bebeste -> beb
387
+ ["este",3,"",["faroeste","agreste"]],
388
+ # bebíamo -> beb
389
+ ["íamo",3],
390
+ # partiram -> part
391
+ ["iram",3],
392
+ # concluíram -> conclu
393
+ ["íram",3],
394
+ ["irão",2],
395
+ # partirde -> part
396
+ ["irde",2],
397
+ # partirei -> part
398
+ ["irei",3,"",["admirei"]],
399
+ # partirem -> part
400
+ ["irem",3,"",["adquirem"]],
401
+ # partiria -> part
402
+ ["iria",3],
403
+ # partirmo -> part
404
+ ["irmo",3],
405
+ # partisse -> part
406
+ ["isse",3],
407
+ # partiste -> part
408
+ ["iste",4],
409
+ ["iava",4,"",["ampliava"]],
410
+ # cantamo -> cant
411
+ ["amo",2],
412
+ ["iona",3],
413
+ # cantara -> cant
414
+ ["ara",2,"",["arara","prepara"]],
415
+ # cantará -> cant
416
+ ["ará",2,"",["alvará"]],
417
+ # cantare -> cant
418
+ ["are",2,"",["prepare"]],
419
+ # cantava -> cant
420
+ ["ava",2,"",["agrava"]],
421
+ # cantemo -> cant
422
+ ["emo",2],
423
+ # bebera -> beb
424
+ ["era",3,"",["acelera","espera"]],
425
+ # beberá -> beb
426
+ ["erá",3],
427
+ # bebere -> beb
428
+ ["ere",3,"",["espere"]],
429
+ # bebiam -> beb
430
+ ["iam",3,"",["enfiam","ampliam","elogiam","ensaiam"]],
431
+ # bebíei -> beb
432
+ ["íei",3],
433
+ # partimo -> part
434
+ ["imo",3,"",["reprimo","intimo","íntimo","nimo","queimo","ximo"]],
435
+ # partira -> part
436
+ ["ira",3,"",["fronteira","sátira"]],
437
+ ["ído",3],
438
+ # partirá -> part
439
+ ["irá",3],
440
+ ["tizar",4,"",["alfabetizar"]],
441
+ ["izar",5,"",["organizar"]],
442
+ ["itar",5,"",["acreditar","explicitar","estreitar"]],
443
+ # partire -> part
444
+ ["ire",3,"",["adquire"]],
445
+ # compomo -> comp
446
+ ["omo",3],
447
+ # cantai -> cant
448
+ ["ai",2],
449
+ # cantam -> cant
450
+ ["am",2],
451
+ # barbear -> barb
452
+ ["ear",4,"",["alardear","nuclear"]],
453
+ # cantar -> cant
454
+ ["ar",2,"",["azar","bazaar","patamar"]],
455
+ # cheguei -> cheg
456
+ ["uei",3],
457
+ ["uía",5,"u"],
458
+ # cantei -> cant
459
+ ["ei",3],
460
+ ["guem",3,"g"],
461
+ # cantem -> cant
462
+ ["em",2,"",["alem","virgem"]],
463
+ # beber -> beb
464
+ ["er",2,"",["éter","pier"]],
465
+ # bebeu -> beb
466
+ ["eu",3,"",["chapeu"]],
467
+ # bebia -> beb
468
+ ["ia",3,"",["estória","fatia","acia","praia","elogia","mania","lábia","aprecia","polícia","arredia","cheia","ásia"]],
469
+ # partir -> part
470
+ ["ir",3,"",["freir"]],
471
+ # partiu -> part
472
+ ["iu",3],
473
+ ["eou",5],
474
+ # chegou -> cheg
475
+ ["ou",3],
476
+ # bebi -> beb
477
+ ["i",3]
478
+ ]
479
+ },
480
+
481
+ # Step 7: Vowel Removal
482
+ {
483
+ :properties => { name:"vowel_reduction", size:0, exceptions:0 },
484
+ :rules => [
485
+ ["bil",2,"vel"],
486
+ ["gue",2,"g",["gangue","jegue"]],
487
+ ["á",3],
488
+ ["ê",3,"",["bebê"]],
489
+ # menina -> menin
490
+ ["a",3,"",["ásia"]],
491
+ # grande -> grand
492
+ ["e",3],
493
+ # menino -> menin
494
+ ["o",3,"",["ão"]]
495
+ ]
496
+ },
497
+ # Step 8: Remove accents
498
+ {
499
+ :properties => { name:"accent_reduction", size:0, exceptions:0 },
500
+ :rules => [
501
+ ["á",1,"a"],
502
+ ["â",1,"a"],
503
+ ["ó",1,"o"],
504
+ ["ô",1,"o"],
505
+ ["é",1,"e"],
506
+ ["í",1,"i"],
507
+ ["ú",1,"u"],
508
+ ]
509
+ }
510
+ ]
511
+
512
# Mixin entry point: stems the string form of the receiver.
#
# A duplicate of the receiver is converted with to_s and handed to Stemmer,
# so the stemmer works on that copy. Returns whatever Stemmer#render returns.
def stemmer
  copy = self.dup.to_s
  Stemmer.new(copy).render
end
516
+
517
# Runs the rule steps declared in RULES over a single word.
#
# The string given to the constructor is modified in place (it is downcased
# and its suffixes are rewritten), so callers should pass a copy they own.
class Stemmer

  # word - String to stem; it is mutated by #render.
  def initialize word
    @suffix_removed = false
    @word = word
    @rules = RULES.freeze

    self
  end

  # Applies all steps in order and returns the stemmed word.
  #
  # Verb reduction only runs when noun reduction removed nothing, and vowel
  # reduction only runs when neither of them removed anything. The flag is
  # therefore cleared right before the noun step, so that a suffix stripped by
  # an earlier step (plural, adverb, feminine, augmentative) does not suppress
  # the verb and vowel steps.
  def render
    @word.downcase!

    apply_rules_by_name('plural_reduction') if @word[-1] == 's'
    apply_rules_by_name('adverb_reduction')
    apply_rules_by_name('feminine_reduction') if @word[-1] == 'a' or @word[-1] == "ã"
    apply_rules_by_name('augmentative_reduction')

    @suffix_removed = false
    apply_rules_by_name('noun_reduction')
    apply_rules_by_name('verb_reduction') unless @suffix_removed
    apply_rules_by_name('vowel_reduction') unless @suffix_removed
    # NOTE: apply_suffix anchors at the end of the word, so this step only
    # rewrites an accented vowel that is the last character.
    apply_rules_by_name('accent_reduction')
  end

  # Applies every rule of the step called +name+, in declaration order, and
  # returns the current word. Rules are not short-circuited: after one rule
  # fires, the remaining rules of the step are still tried on the new word.
  #
  # Each rule is [suffix, min_stem_size, replacement, exceptions]; the last
  # two entries may be missing.
  def apply_rules_by_name(name)
    step = @rules.detect { |candidate| candidate[:properties][:name] == name }
    step[:rules].each do |rule|
      suffix, size, replaced, excpts = rule
      apply_suffix(suffix, size, replaced || '', excpts)
    end

    @word
  end

  # @params:
  #   suffix   => Suffix to remove
  #   size     => Minimal size of the stem left after removing the suffix
  #   replaced => Text put in place of the suffix (defaults to nothing)
  #   excpts   => Words that must not be touched by this rule (may be nil);
  #               compared against the whole word
  #
  # Sets @suffix_removed when the word actually changed.
  def apply_suffix(suffix, size, replaced='', excpts)
    pattern = /#{Regexp.escape(suffix)}$/
    return unless @word =~ pattern
    return if excpts && excpts.include?(@word)
    return if @word.length - suffix.length < size

    # Compare against a copy: gsub! edits @word in place, so an alias of the
    # same object would always compare equal and the change would go unnoticed.
    before = @word.dup
    @word.gsub!(pattern, replaced)
    @suffix_removed = true if before != @word
  end
end
570
+ end
571
+
572
# Core extension that exposes the stemmer directly on String.
class String
  # Returns the stem of this string as produced by Brstemmer::Stemmer#render.
  # The stemmer is given a duplicate, so the receiver itself is not modified.
  def brstemmer
    Brstemmer::Stemmer.new(self.dup.to_s).render
  end
end
578
+
metadata ADDED
@@ -0,0 +1,84 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: brstemmer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Tacnoman
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2016-03-19 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.11'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.11'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ description: Stemmer algorithm in pt-BR
42
+ email:
43
+ - renatocassino@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - ".gitignore"
49
+ - CODE_OF_CONDUCT.md
50
+ - Gemfile
51
+ - LICENSE.txt
52
+ - README.md
53
+ - Rakefile
54
+ - bin/console
55
+ - bin/setup
56
+ - brstemmer-0.1.0.gem
57
+ - brstemmer.gemspec
58
+ - lib/brstemmer.rb
59
+ - lib/brstemmer/version.rb
60
+ homepage: https://github.com/tacnoman/brstemmer.git
61
+ licenses:
62
+ - MIT
63
+ metadata: {}
64
+ post_install_message:
65
+ rdoc_options: []
66
+ require_paths:
67
+ - lib
68
+ required_ruby_version: !ruby/object:Gem::Requirement
69
+ requirements:
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ version: '0'
73
+ required_rubygems_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ requirements: []
79
+ rubyforge_project:
80
+ rubygems_version: 2.4.8
81
+ signing_key:
82
+ specification_version: 4
83
+ summary: Helper to make a stemmer algorithm in pt-BR.
84
+ test_files: []