pascoale 0.0.1 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/README.md +32 -4
- data/data/errors.txt +1124 -0
- data/data/everything.txt +177302 -0
- data/data/unique_errors.txt +957 -0
- data/lib/pascoale/constants.rb +8 -0
- data/lib/pascoale/edits.rb +1 -1
- data/lib/pascoale/syllable_separator.rb +44 -0
- data/lib/pascoale/syllable_separator_benchmark.rb +29 -0
- data/lib/pascoale/version.rb +1 -1
- data/lib/pascoale.rb +8 -3
- data/pascoale.gemspec +1 -0
- data/spec/lib/pascoale/syllable_separator_spec.rb +150 -0
- metadata +38 -14
data/lib/pascoale/edits.rb
CHANGED
@@ -0,0 +1,44 @@
|
|
1
|
+
module Pascoale
  # Splits a word into syllables by repeatedly matching regular expressions
  # built from onset / nucleus / coda fragments.
  #
  #   Pascoale::SyllableSeparator.new('batata').separated # => ["ba", "ta", "ta"]
  class SyllableSeparator
    include Constants

    # Optional start of a syllable: one of the digraphs ch/lh/nh/gu/qu, one of
    # p b t d c g f v followed by "l" or "r", or any single consonant.
    ONSET = "(?:ch|lh|nh|gu|qu|[pbtdcgfv][lr]|[#{CONSONANTS}])"

    # Still in doubt if we should add suffixes to the "i" semivowel...
    # it slightly improves the matches, but some of them cause more
    # noise than they fix =\
    #NUCLEUS = "(?:ãe|ão|õe|[#{VOWELS}](?:u|i(?!nh|r$|m$|dora?$|ção$|dade$))?)"
    NUCLEUS = "(?:ãe|ão|õe|[#{VOWELS}](?:u|i(?!nh|r$|m$|ção$|dora?$))?)"

    # A single consonant closing a syllable.
    CODA = "[#{CONSONANTS}]"

    # The concept of "rhyme" does not help in this algorithm. It seems the
    # concept makes no sense for syllable separation in portuguese
    KERNEL = "#{ONSET}?#{NUCLEUS}"

    # What may come right after the kernel of the current syllable. The
    # alternatives are tried in order; the capture numbers below assume this
    # fragment is placed right after capture group 1:
    #   2      next kernel follows immediately (current syllable has no coda)
    #   3, 4   one coda consonant, then the next kernel
    #   5, 6   two coda consonants, then the next kernel
    #   7      two coda consonants with no kernel after them
    #   8      one coda consonant with no kernel after it
    #   9      whatever is left of the word
    FOLLOW = "(?:(#{KERNEL})|(#{CODA})(#{KERNEL})|(#{CODA}#{CODA})(#{KERNEL})|(#{CODA}#{CODA})|(#{CODA}))?(.*)"

    # Regular case: the syllable starts with a kernel. Built once here
    # instead of being re-interpolated on every loop iteration.
    SYLLABLE = /^(#{KERNEL})#{FOLLOW}$/

    # Special case! Hate them :(
    # Pneu, Gnomo, Mnemônica, Pseudônimo: an extra consonant before the
    # kernel, accepted only for the first syllable of the word.
    LEADING_CONSONANT_SYLLABLE = /^([#{CONSONANTS}]#{KERNEL})#{FOLLOW}$/

    # @param word [String] the word to be separated
    def initialize(word)
      @word = word
    end

    # Separates the word into syllables.
    #
    # @return [Array<String>] the syllables in order; empty when the word is
    #   nil or empty
    # @raise [RuntimeError] when no rule matches the next syllable. This now
    #   also covers a first syllable that matches neither pattern, which
    #   previously left the loop spinning forever.
    def separated
      rest = @word
      result = []
      while rest && !rest.empty?
        match = SYLLABLE.match(rest)
        # The leading-consonant rule only applies while nothing was separated yet.
        match ||= LEADING_CONSONANT_SYLLABLE.match(rest) if result.empty?
        unless match
          raise %(Cannot separate "#{@word}". No rule match next syllable at "#{result.join('')}|>#{rest}")
        end

        # Coda captures (3, 5, 7, 8) stay with the current syllable...
        result << match[1] + match[3].to_s + match[5].to_s + match[7].to_s + match[8].to_s
        # ...while the next kernel (2, 4, 6) and the tail (9) are processed again.
        rest = match[2].to_s + match[4].to_s + match[6].to_s + match[9].to_s
      end
      result
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'pascoale'
|
2
|
+
|
3
|
+
# Runs SyllableSeparator over every entry of data/everything.txt and prints
# each word whose separation differs from the one stored in the file, followed
# by a summary (last word read, number of correct and wrong separations).
correct_counter = 0
wrong_counter = 0
word = nil

begin
  # File.foreach only ever reads a file; Kernel#open would spawn a subprocess
  # if the path string happened to start with "|".
  File.foreach("#{Pascoale.root}/data/everything.txt") do |line|
    begin
      # NOTE(review): eval executes each line of the data file as Ruby code.
      # That is only acceptable while the file is trusted; consider replacing
      # it with a real parser for the line format.
      word, _, separation = eval(line)
      next if word =~ /\-/

      syllables = Pascoale::SyllableSeparator.new(word).separated
      if syllables == separation
        correct_counter += 1
      else
        wrong_counter += 1
        puts "#{word} - #{syllables} - #{separation}"
      end
    rescue => e
      # A word that cannot be separated is reported and the run continues.
      puts e.message
    end
  end
ensure
  # Printed even when the run is interrupted, so partial results are visible.
  puts "   Last: #{word}"
  puts "Correct: #{correct_counter}"
  puts "  Wrong: #{wrong_counter}"
end
|
data/lib/pascoale/version.rb
CHANGED
data/lib/pascoale.rb
CHANGED
@@ -1,5 +1,10 @@
|
|
1
|
-
require 'pascoale/version'
|
2
|
-
require 'pascoale/edits'
|
3
|
-
|
4
1
|
module Pascoale
  # Absolute path of the directory one level above the directory that
  # contains this file.
  #
  # @return [String]
  def self.root
    here = File.dirname(__FILE__)
    File.expand_path('..', here)
  end
end
|
6
|
+
|
7
|
+
require 'pascoale/version'
|
8
|
+
require 'pascoale/constants'
|
9
|
+
require 'pascoale/edits'
|
10
|
+
require 'pascoale/syllable_separator'
|
data/pascoale.gemspec
CHANGED
@@ -0,0 +1,150 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Custom matcher: passes when SyllableSeparator splits the actual word into
# exactly the expected list of syllables.
RSpec::Matchers.define :separate_as do |expected|
  # Kept outside the blocks so the failure message can show what was produced.
  syllables = nil

  match do |word|
    syllables = Pascoale::SyllableSeparator.new(word).separated
    syllables == expected
  end

  failure_message do |word|
    %Q[ expected "#{word}" to separate as "#{expected}", but was "#{syllables}"]
  end
end
|
13
|
+
|
14
|
+
describe Pascoale::SyllableSeparator do
  # Checks every word => syllables pair of the table, in insertion order,
  # with the separate_as matcher. The first mismatch fails the example.
  def expect_separations(table)
    table.each { |word, syllables| expect(word).to separate_as syllables }
  end

  it 'separates simple words' do
    expect_separations(
      'bola'   => %w[bo la],
      'batata' => %w[ba ta ta]
    )
  end

  it 'keeps some "dígrafos" together' do
    expect_separations(
      'chocalho' => %w[cho ca lho],
      'batuque'  => %w[ba tu que],
      'guelha'   => %w[gue lha]
    )
  end

  it 'separates other "dígrafos"' do
    expect_separations(
      'bossa'    => %w[bos sa],
      'bosta'    => %w[bos ta],
      'cassado'  => %w[cas sa do],
      'carrasco' => %w[car ras co]
    )
  end

  it 'handles double consonants border cases' do
    expect_separations(
      'destravar'   => %w[des tra var],
      'desencontro' => %w[de sen con tro],
      'exaltar'     => %w[e xal tar],
      'excomungar'  => %w[ex co mun gar]
    )
  end

  it 'keeps "oclusivas" and "fricativas" together with "r" or "l"' do
    expect_separations(
      'brasa'           => %w[bra sa],
      'agrupado'        => %w[a gru pa do],
      'fragrância'      => %w[fra grân ci a],
      'protocriogênico' => %w[pro to cri o gê ni co]
    )
  end

  it 'keeps long codas' do
    expect_separations(
      'transpiração'     => %w[trans pi ra ção],
      'transatlântico'   => %w[tran sa tlân ti co],
      'mirins'           => %w[mi rins],
      'transerrano'      => %w[tran ser ra no],
      'solstício'        => %w[sols tí ci o],
      'perspectiva'      => %w[pers pec ti va],
      'substância'       => %w[subs tân ci a],
      # Yes! It's a real word o_O
      'falansterialismo' => %w[fa lans te ri a lis mo]
    )
  end

  it 'separates single vowels at beggining' do
    expect_separations(
      'abacaxi' => %w[a ba ca xi],
      'exceto'  => %w[ex ce to],
      'arrocho' => %w[ar ro cho]
    )
  end

  it 'keeps "ditongos" together' do
    expect_separations(
      'maizena'  => %w[mai ze na],
      'mausoléu' => %w[mau so léu],
      'ação'     => %w[a ção],
      'põe'      => %w[põe],
      'exceção'  => %w[ex ce ção]
    )
  end

  it 'handle "tritongos"' do
    expect_separations(
      'ideia'    => %w[i dei a],
      'tireoide' => %w[ti re oi de],
      'praia'    => %w[prai a],
      'feio'     => %w[fei o],
      'vaia'     => %w[vai a]
    )
  end

  it 'separates "hiatos"' do
    expect_separations(
      'moeda' => %w[mo e da],
      'leal'  => %w[le al],
      'aéreo' => %w[a é re o],
      'pior'  => %w[pi or],
      'raíz'  => %w[ra íz],
      'ruído' => %w[ru í do]
    )
  end

  it 'keeps the first consonant of the word together' do
    expect_separations(
      'pneu'           => %w[pneu],
      'apneia'         => %w[ap nei a],
      'pneumático'     => %w[pneu má ti co],
      'piropneumático' => %w[pi rop neu má ti co],
      'mnemônica'      => %w[mne mô ni ca],
      'pseudônimo'     => %w[pseu dô ni mo],
      'gnomo'          => %w[gno mo]
    )
  end

  it 'keeps "sineréses" together' do
    expect_separations(
      'saudade' => %w[sau da de],
      'vaidade' => %w[vai da de],
      'suave'   => %w[su a ve]

      # Not sure how to deal with these
      # 'traidor' => %w[trai dor]
    )
  end

  it 'separates "dieréses"' do
    expect_separations(
      'conluiador' => %w[con lui a dor],
      'conluio'    => %w[con lui o],
      'aleluia'    => %w[a le lui a],
      'toluico'    => %w[to lui co],

      'alauita'    => %w[a lau i ta],

      'rainha'     => %w[ra i nha],
      'tainheira'  => %w[ta i nhei ra],

      'construir'  => %w[cons tru ir],
      'destruir'   => %w[des tru ir],
      'destruição' => %w[des tru i ção]

      # Disabled expectations, kept for reference:
      # 'acuidade' => %w[a cu i da de]
      # 'ajuizar'  => %w[a ju i zar]
    )
  end

  it 'separates random words' do
    expect_separations(
      'acidentariamente' => %w[a ci den ta ri a men te],
      'cooperar'         => %w[co o pe rar],
      'abstraído'        => %w[abs tra í do],
      'abstenção'        => %w[abs ten ção],
      'colapso'          => %w[co lap so],
      'piauí'            => %w[pi au í],
      'aguei'            => %w[a guei],
      'compreender'      => %w[com pre en der],
      'caatinga'         => %w[ca a tin ga],
      'atmosfera'        => %w[at mos fe ra],

      'tuiuiú'           => %w[tui ui ú],

      # I really don't buy the whole "comes from latin" thing.
      # Our separation is phonetic-based, so keep it that way!
      'abrupto'          => %w[a brup to],
      'abruptamente'     => %w[a brup ta men te],
      # For example, the word below is correctly separated
      # (as the dictionary says). =\
      'abrupção'         => %w[a brup ção]
    )
  end
end
|
metadata
CHANGED
@@ -1,61 +1,75 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pascoale
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ronie Uliana
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-04-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ~>
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '1.5'
|
20
|
-
- -
|
20
|
+
- - ">="
|
21
21
|
- !ruby/object:Gem::Version
|
22
22
|
version: 1.5.1
|
23
23
|
type: :development
|
24
24
|
prerelease: false
|
25
25
|
version_requirements: !ruby/object:Gem::Requirement
|
26
26
|
requirements:
|
27
|
-
- - ~>
|
27
|
+
- - "~>"
|
28
28
|
- !ruby/object:Gem::Version
|
29
29
|
version: '1.5'
|
30
|
-
- -
|
30
|
+
- - ">="
|
31
31
|
- !ruby/object:Gem::Version
|
32
32
|
version: 1.5.1
|
33
33
|
- !ruby/object:Gem::Dependency
|
34
34
|
name: rake
|
35
35
|
requirement: !ruby/object:Gem::Requirement
|
36
36
|
requirements:
|
37
|
-
- -
|
37
|
+
- - ">="
|
38
38
|
- !ruby/object:Gem::Version
|
39
39
|
version: '0'
|
40
40
|
type: :development
|
41
41
|
prerelease: false
|
42
42
|
version_requirements: !ruby/object:Gem::Requirement
|
43
43
|
requirements:
|
44
|
-
- -
|
44
|
+
- - ">="
|
45
45
|
- !ruby/object:Gem::Version
|
46
46
|
version: '0'
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: rspec
|
49
|
+
requirement: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - "~>"
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 3.0.beta
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - "~>"
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: 3.0.beta
|
47
61
|
- !ruby/object:Gem::Dependency
|
48
62
|
name: guard-rspec
|
49
63
|
requirement: !ruby/object:Gem::Requirement
|
50
64
|
requirements:
|
51
|
-
- -
|
65
|
+
- - ">="
|
52
66
|
- !ruby/object:Gem::Version
|
53
67
|
version: '0'
|
54
68
|
type: :development
|
55
69
|
prerelease: false
|
56
70
|
version_requirements: !ruby/object:Gem::Requirement
|
57
71
|
requirements:
|
58
|
-
- -
|
72
|
+
- - ">="
|
59
73
|
- !ruby/object:Gem::Version
|
60
74
|
version: '0'
|
61
75
|
description:
|
@@ -65,17 +79,26 @@ executables: []
|
|
65
79
|
extensions: []
|
66
80
|
extra_rdoc_files: []
|
67
81
|
files:
|
68
|
-
- .gitignore
|
82
|
+
- ".gitignore"
|
83
|
+
- ".ruby-gemset"
|
84
|
+
- ".ruby-version"
|
69
85
|
- Gemfile
|
70
86
|
- Guardfile
|
71
87
|
- LICENSE.txt
|
72
88
|
- README.md
|
73
89
|
- Rakefile
|
90
|
+
- data/errors.txt
|
91
|
+
- data/everything.txt
|
92
|
+
- data/unique_errors.txt
|
74
93
|
- lib/pascoale.rb
|
94
|
+
- lib/pascoale/constants.rb
|
75
95
|
- lib/pascoale/edits.rb
|
96
|
+
- lib/pascoale/syllable_separator.rb
|
97
|
+
- lib/pascoale/syllable_separator_benchmark.rb
|
76
98
|
- lib/pascoale/version.rb
|
77
99
|
- pascoale.gemspec
|
78
100
|
- spec/lib/pascoale/edits_spec.rb
|
101
|
+
- spec/lib/pascoale/syllable_separator_spec.rb
|
79
102
|
- spec/spec_helper.rb
|
80
103
|
homepage: http://github.com/ruliana/pascoale
|
81
104
|
licenses:
|
@@ -87,20 +110,21 @@ require_paths:
|
|
87
110
|
- lib
|
88
111
|
required_ruby_version: !ruby/object:Gem::Requirement
|
89
112
|
requirements:
|
90
|
-
- -
|
113
|
+
- - ">="
|
91
114
|
- !ruby/object:Gem::Version
|
92
115
|
version: '0'
|
93
116
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
94
117
|
requirements:
|
95
|
-
- -
|
118
|
+
- - ">="
|
96
119
|
- !ruby/object:Gem::Version
|
97
120
|
version: '0'
|
98
121
|
requirements: []
|
99
122
|
rubyforge_project:
|
100
|
-
rubygems_version: 2.
|
123
|
+
rubygems_version: 2.2.0
|
101
124
|
signing_key:
|
102
125
|
specification_version: 4
|
103
126
|
summary: Text processing utilities for Brazilian Portuguese
|
104
127
|
test_files:
|
105
128
|
- spec/lib/pascoale/edits_spec.rb
|
129
|
+
- spec/lib/pascoale/syllable_separator_spec.rb
|
106
130
|
- spec/spec_helper.rb
|