pascoale 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/README.md +32 -4
- data/data/errors.txt +1124 -0
- data/data/everything.txt +177302 -0
- data/data/unique_errors.txt +957 -0
- data/lib/pascoale/constants.rb +8 -0
- data/lib/pascoale/edits.rb +1 -1
- data/lib/pascoale/syllable_separator.rb +44 -0
- data/lib/pascoale/syllable_separator_benchmark.rb +29 -0
- data/lib/pascoale/version.rb +1 -1
- data/lib/pascoale.rb +8 -3
- data/pascoale.gemspec +1 -0
- data/spec/lib/pascoale/syllable_separator_spec.rb +150 -0
- metadata +38 -14
data/lib/pascoale/edits.rb
CHANGED
@@ -0,0 +1,44 @@
|
|
1
|
+
module Pascoale
  # Splits a Portuguese word into syllables using regular expressions built
  # from the CONSONANTS and VOWELS character sets supplied by Constants.
  #
  # A syllable is modelled as an optional onset plus a nucleus (the "kernel"),
  # optionally followed by one or two coda consonants. How many consonants stay
  # in the current syllable depends on whether another kernel follows them.
  class SyllableSeparator
    include Constants

    # Consonant material that may open a syllable: a digraph, an
    # obstruent + "l"/"r" cluster, or any single consonant.
    ONSET = "(?:ch|lh|nh|gu|qu|[pbtdcgfv][lr]|[#{CONSONANTS}])"

    # Still in doubt whether more suffixes should be added to the "i"
    # semivowel lookahead: it slightly improves the matches, but some of the
    # suffixes cause more noise than they fix. The wider variant is kept here
    # for reference:
    #NUCLEUS = "(?:ãe|ão|õe|[#{VOWELS}](?:u|i(?!nh|r$|m$|dora?$|ção$|dade$))?)"
    NUCLEUS = "(?:ãe|ão|õe|[#{VOWELS}](?:u|i(?!nh|r$|m$|ção$|dora?$))?)"

    # A single consonant closing a syllable.
    CODA = "[#{CONSONANTS}]"

    # The concept of "rhyme" does not help in this algorithm. It seems the
    # concept makes no sense for syllable separation in Portuguese.
    KERNEL = "#{ONSET}?#{NUCLEUS}"

    # Everything that may follow the first kernel of the remaining text.
    # Capture groups (numbered as seen from the full patterns below, where
    # group 1 is the leading kernel):
    #   2          next kernel, no coda in between
    #   3, 4       one coda consonant, then the next kernel
    #   5, 6       two coda consonants, then the next kernel
    #   7          two coda consonants with no kernel after them
    #   8          one coda consonant with no kernel after it
    #   9          whatever is left on the line
    TAIL = "(?:(#{KERNEL})|(#{CODA})(#{KERNEL})|(#{CODA}#{CODA})(#{KERNEL})|(#{CODA}#{CODA})|(#{CODA}))?(.*)$"

    # Regular case: the remaining text starts with a kernel.
    # Compiled once here instead of on every loop iteration.
    SYLLABLE = /^(#{KERNEL})#{TAIL}/

    # Special case! Hate them :(
    # Pneu, Gnomo, Mnemônica, Pseudônimo: an extra consonant in front of the
    # very first kernel of the word.
    LEADING_CONSONANT_SYLLABLE = /^([#{CONSONANTS}]#{KERNEL})#{TAIL}/

    private_constant :TAIL, :SYLLABLE, :LEADING_CONSONANT_SYLLABLE

    # @param word [String] the word to be separated
    def initialize(word)
      @word = word
    end

    # Separates the word given to the constructor into syllables.
    #
    # @return [Array<String>] the syllables in order; empty when the word is
    #   nil or empty
    # @raise [RuntimeError] when no rule matches the next syllable. This now
    #   also covers a word whose very first syllable cannot be matched, which
    #   previously left the loop spinning forever because nothing was consumed.
    def separated
      rest = @word
      result = []
      while rest && rest.size > 0
        match = SYLLABLE.match(rest)
        # The leading-consonant rule only applies to the first syllable.
        match ||= LEADING_CONSONANT_SYLLABLE.match(rest) if result.empty?
        unless match
          raise %(Cannot separate "#{@word}". No rule match next syllable at "#{result.join('')}|>#{rest}")
        end

        # The current syllable keeps the leading kernel plus any coda
        # consonants; the following kernel and the tail are processed on the
        # next pass. Every match consumes at least one character, so the loop
        # always terminates.
        result << match[1] + match[3].to_s + match[5].to_s + match[7].to_s + match[8].to_s
        rest = match[2].to_s + match[4].to_s + match[6].to_s + match[9].to_s
      end
      result
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'pascoale'
|
2
|
+
|
3
|
+
# Benchmark for Pascoale::SyllableSeparator: runs the separator over every
# entry of data/everything.txt and reports how many separations agree with the
# one stored in the file. Each mismatch is printed as
# "word - computed - expected".
correct_counter = 0
wrong_counter = 0
word = nil

begin
  # File.open instead of Kernel#open: Kernel#open treats a path starting with
  # "|" as a command to run, File.open only ever opens a file.
  File.open("#{Pascoale.root}/data/everything.txt") do |file|
    file.each_line do |line|
      begin
        # SECURITY: each line is evaluated as Ruby code, so this script must
        # only be run against a trusted data file. The first and third values
        # of the evaluated line are used as the word and its expected
        # separation; the second value is ignored.
        word, _, separation = eval(line)
        # Words containing a hyphen are skipped.
        next if word =~ /\-/
        s = Pascoale::SyllableSeparator.new(word).separated
        if s == separation
          correct_counter += 1
        else
          wrong_counter += 1
          puts "#{word} - #{s} - #{separation}"
        end
      rescue => e
        # Best effort: report the failure and move on to the next line.
        # Failed lines are counted neither as correct nor as wrong.
        puts e.message
      end
    end
  end
ensure
  # Always print the summary, even when the run is interrupted.
  puts "   Last: #{word}"
  puts "Correct: #{correct_counter}"
  puts "  Wrong: #{wrong_counter}"
end
|
data/lib/pascoale/version.rb
CHANGED
data/lib/pascoale.rb
CHANGED
@@ -1,5 +1,10 @@
|
|
1
|
-
require 'pascoale/version'
|
2
|
-
require 'pascoale/edits'
|
3
|
-
|
4
1
|
module Pascoale
  # Absolute path of the directory one level above the directory that
  # contains this file.
  #
  # @return [String]
  def self.root
    containing_dir = File.dirname(__FILE__)
    File.expand_path('..', containing_dir)
  end
end
|
6
|
+
|
7
|
+
require 'pascoale/version'
|
8
|
+
require 'pascoale/constants'
|
9
|
+
require 'pascoale/edits'
|
10
|
+
require 'pascoale/syllable_separator'
|
data/pascoale.gemspec
CHANGED
@@ -0,0 +1,150 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Custom matcher: `expect(word).to separate_as(syllables)` passes when
# Pascoale::SyllableSeparator splits `word` into exactly `syllables`.
RSpec::Matchers.define :separate_as do |expected|
  # Remembered between the match and the failure message, so the message can
  # show what the separator actually produced.
  syllables = nil

  match do |word|
    syllables = Pascoale::SyllableSeparator.new(word).separated
    syllables == expected
  end

  failure_message do |word|
    %( expected "#{word}" to separate as "#{expected}", but was "#{syllables}")
  end
end
|
13
|
+
|
14
|
+
# Examples for Pascoale::SyllableSeparator, grouped by the phonetic rule each
# set of words exercises.
describe Pascoale::SyllableSeparator do
  it 'separates simple words' do
    expect('bola').to separate_as(%w[bo la])
    expect('batata').to separate_as(%w[ba ta ta])
  end

  it 'keeps some "dígrafos" together' do
    expect('chocalho').to separate_as(%w[cho ca lho])
    expect('batuque').to separate_as(%w[ba tu que])
    expect('guelha').to separate_as(%w[gue lha])
  end

  it 'separates other "dígrafos"' do
    expect('bossa').to separate_as(%w[bos sa])
    expect('bosta').to separate_as(%w[bos ta])
    expect('cassado').to separate_as(%w[cas sa do])
    expect('carrasco').to separate_as(%w[car ras co])
  end

  it 'handles double consonants border cases' do
    expect('destravar').to separate_as(%w[des tra var])
    expect('desencontro').to separate_as(%w[de sen con tro])
    expect('exaltar').to separate_as(%w[e xal tar])
    expect('excomungar').to separate_as(%w[ex co mun gar])
  end

  it 'keeps "oclusivas" and "fricativas" together with "r" or "l"' do
    expect('brasa').to separate_as(%w[bra sa])
    expect('agrupado').to separate_as(%w[a gru pa do])
    expect('fragrância').to separate_as(%w[fra grân ci a])
    expect('protocriogênico').to separate_as(%w[pro to cri o gê ni co])
  end

  it 'keeps long codas' do
    expect('transpiração').to separate_as(%w[trans pi ra ção])
    expect('transatlântico').to separate_as(%w[tran sa tlân ti co])
    expect('mirins').to separate_as(%w[mi rins])
    expect('transerrano').to separate_as(%w[tran ser ra no])
    expect('solstício').to separate_as(%w[sols tí ci o])
    expect('perspectiva').to separate_as(%w[pers pec ti va])
    expect('substância').to separate_as(%w[subs tân ci a])
    # Believe it or not, this one is a real word.
    expect('falansterialismo').to separate_as(%w[fa lans te ri a lis mo])
  end

  it 'separates single vowels at beggining' do
    expect('abacaxi').to separate_as(%w[a ba ca xi])
    expect('exceto').to separate_as(%w[ex ce to])
    expect('arrocho').to separate_as(%w[ar ro cho])
  end

  it 'keeps "ditongos" together' do
    expect('maizena').to separate_as(%w[mai ze na])
    expect('mausoléu').to separate_as(%w[mau so léu])
    expect('ação').to separate_as(%w[a ção])
    expect('põe').to separate_as(%w[põe])
    expect('exceção').to separate_as(%w[ex ce ção])
  end

  it 'handle "tritongos"' do
    expect('ideia').to separate_as(%w[i dei a])
    expect('tireoide').to separate_as(%w[ti re oi de])
    expect('praia').to separate_as(%w[prai a])
    expect('feio').to separate_as(%w[fei o])
    expect('vaia').to separate_as(%w[vai a])
  end

  it 'separates "hiatos"' do
    expect('moeda').to separate_as(%w[mo e da])
    expect('leal').to separate_as(%w[le al])
    expect('aéreo').to separate_as(%w[a é re o])
    expect('pior').to separate_as(%w[pi or])
    expect('raíz').to separate_as(%w[ra íz])
    expect('ruído').to separate_as(%w[ru í do])
  end

  it 'keeps the first consonant of the word together' do
    expect('pneu').to separate_as(%w[pneu])
    expect('apneia').to separate_as(%w[ap nei a])
    expect('pneumático').to separate_as(%w[pneu má ti co])
    expect('piropneumático').to separate_as(%w[pi rop neu má ti co])
    expect('mnemônica').to separate_as(%w[mne mô ni ca])
    expect('pseudônimo').to separate_as(%w[pseu dô ni mo])
    expect('gnomo').to separate_as(%w[gno mo])
  end

  it 'keeps "sineréses" together' do
    expect('saudade').to separate_as(%w[sau da de])
    expect('vaidade').to separate_as(%w[vai da de])
    expect('suave').to separate_as(%w[su a ve])

    # Disabled: not sure yet how to deal with this case.
    #expect('traidor').to separate_as %w(trai dor)
  end

  it 'separates "dieréses"' do
    expect('conluiador').to separate_as(%w[con lui a dor])
    expect('conluio').to separate_as(%w[con lui o])
    expect('aleluia').to separate_as(%w[a le lui a])
    expect('toluico').to separate_as(%w[to lui co])

    expect('alauita').to separate_as(%w[a lau i ta])

    expect('rainha').to separate_as(%w[ra i nha])
    expect('tainheira').to separate_as(%w[ta i nhei ra])

    expect('construir').to separate_as(%w[cons tru ir])
    expect('destruir').to separate_as(%w[des tru ir])
    expect('destruição').to separate_as(%w[des tru i ção])

    # Disabled expectations, kept for reference.
    #expect('acuidade').to separate_as %w(a cu i da de)
    #expect('ajuizar').to separate_as %w(a ju i zar)
  end

  it 'separates random words' do
    expect('acidentariamente').to separate_as(%w[a ci den ta ri a men te])
    expect('cooperar').to separate_as(%w[co o pe rar])
    expect('abstraído').to separate_as(%w[abs tra í do])
    expect('abstenção').to separate_as(%w[abs ten ção])
    expect('colapso').to separate_as(%w[co lap so])
    expect('piauí').to separate_as(%w[pi au í])
    expect('aguei').to separate_as(%w[a guei])
    expect('compreender').to separate_as(%w[com pre en der])
    expect('caatinga').to separate_as(%w[ca a tin ga])
    expect('atmosfera').to separate_as(%w[at mos fe ra])

    expect('tuiuiú').to separate_as(%w[tui ui ú])

    # I really don't buy the whole "comes from Latin" argument.
    # Our separation is phonetic, so keep it that way!
    expect('abrupto').to separate_as(%w[a brup to])
    expect('abruptamente').to separate_as(%w[a brup ta men te])
    # For example, the word below is correctly separated
    # (as the dictionary says). =\
    expect('abrupção').to separate_as(%w[a brup ção])
  end
end
|
metadata
CHANGED
@@ -1,61 +1,75 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pascoale
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ronie Uliana
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-04-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ~>
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '1.5'
|
20
|
-
- -
|
20
|
+
- - ">="
|
21
21
|
- !ruby/object:Gem::Version
|
22
22
|
version: 1.5.1
|
23
23
|
type: :development
|
24
24
|
prerelease: false
|
25
25
|
version_requirements: !ruby/object:Gem::Requirement
|
26
26
|
requirements:
|
27
|
-
- - ~>
|
27
|
+
- - "~>"
|
28
28
|
- !ruby/object:Gem::Version
|
29
29
|
version: '1.5'
|
30
|
-
- -
|
30
|
+
- - ">="
|
31
31
|
- !ruby/object:Gem::Version
|
32
32
|
version: 1.5.1
|
33
33
|
- !ruby/object:Gem::Dependency
|
34
34
|
name: rake
|
35
35
|
requirement: !ruby/object:Gem::Requirement
|
36
36
|
requirements:
|
37
|
-
- -
|
37
|
+
- - ">="
|
38
38
|
- !ruby/object:Gem::Version
|
39
39
|
version: '0'
|
40
40
|
type: :development
|
41
41
|
prerelease: false
|
42
42
|
version_requirements: !ruby/object:Gem::Requirement
|
43
43
|
requirements:
|
44
|
-
- -
|
44
|
+
- - ">="
|
45
45
|
- !ruby/object:Gem::Version
|
46
46
|
version: '0'
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: rspec
|
49
|
+
requirement: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - "~>"
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 3.0.beta
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - "~>"
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: 3.0.beta
|
47
61
|
- !ruby/object:Gem::Dependency
|
48
62
|
name: guard-rspec
|
49
63
|
requirement: !ruby/object:Gem::Requirement
|
50
64
|
requirements:
|
51
|
-
- -
|
65
|
+
- - ">="
|
52
66
|
- !ruby/object:Gem::Version
|
53
67
|
version: '0'
|
54
68
|
type: :development
|
55
69
|
prerelease: false
|
56
70
|
version_requirements: !ruby/object:Gem::Requirement
|
57
71
|
requirements:
|
58
|
-
- -
|
72
|
+
- - ">="
|
59
73
|
- !ruby/object:Gem::Version
|
60
74
|
version: '0'
|
61
75
|
description:
|
@@ -65,17 +79,26 @@ executables: []
|
|
65
79
|
extensions: []
|
66
80
|
extra_rdoc_files: []
|
67
81
|
files:
|
68
|
-
- .gitignore
|
82
|
+
- ".gitignore"
|
83
|
+
- ".ruby-gemset"
|
84
|
+
- ".ruby-version"
|
69
85
|
- Gemfile
|
70
86
|
- Guardfile
|
71
87
|
- LICENSE.txt
|
72
88
|
- README.md
|
73
89
|
- Rakefile
|
90
|
+
- data/errors.txt
|
91
|
+
- data/everything.txt
|
92
|
+
- data/unique_errors.txt
|
74
93
|
- lib/pascoale.rb
|
94
|
+
- lib/pascoale/constants.rb
|
75
95
|
- lib/pascoale/edits.rb
|
96
|
+
- lib/pascoale/syllable_separator.rb
|
97
|
+
- lib/pascoale/syllable_separator_benchmark.rb
|
76
98
|
- lib/pascoale/version.rb
|
77
99
|
- pascoale.gemspec
|
78
100
|
- spec/lib/pascoale/edits_spec.rb
|
101
|
+
- spec/lib/pascoale/syllable_separator_spec.rb
|
79
102
|
- spec/spec_helper.rb
|
80
103
|
homepage: http://github.com/ruliana/pascoale
|
81
104
|
licenses:
|
@@ -87,20 +110,21 @@ require_paths:
|
|
87
110
|
- lib
|
88
111
|
required_ruby_version: !ruby/object:Gem::Requirement
|
89
112
|
requirements:
|
90
|
-
- -
|
113
|
+
- - ">="
|
91
114
|
- !ruby/object:Gem::Version
|
92
115
|
version: '0'
|
93
116
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
94
117
|
requirements:
|
95
|
-
- -
|
118
|
+
- - ">="
|
96
119
|
- !ruby/object:Gem::Version
|
97
120
|
version: '0'
|
98
121
|
requirements: []
|
99
122
|
rubyforge_project:
|
100
|
-
rubygems_version: 2.
|
123
|
+
rubygems_version: 2.2.0
|
101
124
|
signing_key:
|
102
125
|
specification_version: 4
|
103
126
|
summary: Text processing utilities for Brazilian Portuguese
|
104
127
|
test_files:
|
105
128
|
- spec/lib/pascoale/edits_spec.rb
|
129
|
+
- spec/lib/pascoale/syllable_separator_spec.rb
|
106
130
|
- spec/spec_helper.rb
|