pascoale 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,8 @@
1
+ module Pascoale
2
+ module Constants
3
+ VOWELS = 'aeiouáéíóúâêôãõày'
4
+ SEMIVOWELS = 'iu'
5
+ CONSONANTS = 'bcdfghjklmnpqrstvwxzç'
6
+ LETTERS = VOWELS + CONSONANTS
7
+ end
8
+ end
@@ -1,5 +1,5 @@
1
1
  class Pascoale::Edits
2
- LETTERS = ' abcdefghijklmnopqrstuvwxyzáéíóúâêôãõç'.scan(/./)
2
+ LETTERS = [' '] + Pascoale::Constants::LETTERS.scan(/./)
3
3
 
4
4
  def initialize(word)
5
5
  @splits = (0..(word.size)).map do |i|
@@ -0,0 +1,44 @@
1
+ module Pascoale
2
+ class SyllableSeparator
3
+ include Constants
4
+
5
+ ONSET = "(?:ch|lh|nh|gu|qu|[pbtdcgfv][lr]|[#{CONSONANTS}])"
6
+
7
+ # Still in doubt if we should add suffixes to the "i" semivowel...
8
+ # it slightly improves the matches, but some of them cause more
9
+ # noise than they fix =\
10
+ #NUCLEUS = "(?:ãe|ão|õe|[#{VOWELS}](?:u|i(?!nh|r$|m$|dora?$|ção$|dade$))?)"
11
+ NUCLEUS = "(?:ãe|ão|õe|[#{VOWELS}](?:u|i(?!nh|r$|m$|ção$|dora?$))?)"
12
+
13
+ CODA = "[#{CONSONANTS}]"
14
+
15
+ # The concept of "rhyme" does not help in this algorithm. It seems the
16
+ # concept makes no sense for syllable separation in Portuguese
17
+ KERNEL = "#{ONSET}?#{NUCLEUS}"
18
+
19
+ def initialize(word)
20
+ @word = word
21
+ end
22
+
23
+ def separated
24
+ rest = @word
25
+ result = []
26
+ while rest && rest.size > 0
27
+ if rest =~ /^(#{KERNEL})(?:(#{KERNEL})|(#{CODA})(#{KERNEL})|(#{CODA}#{CODA})(#{KERNEL})|(#{CODA}#{CODA})|(#{CODA}))?(.*)$/
28
+ result << $1 + $3.to_s + $5.to_s + $7.to_s + $8.to_s
29
+ rest = $2.to_s + $4.to_s + $6.to_s + $9.to_s
30
+ # Special case! Hate them :(
31
+ # Pneu, Gnomo, Mnemônica, Pseudônimo
32
+ elsif result.size == 0
33
+ if rest =~ /^([#{CONSONANTS}]#{KERNEL})(?:(#{KERNEL})|(#{CODA})(#{KERNEL})|(#{CODA}#{CODA})(#{KERNEL})|(#{CODA}#{CODA})|(#{CODA}))?(.*)$/
34
+ result << $1 + $3.to_s + $5.to_s + $7.to_s + $8.to_s
35
+ rest = $2.to_s + $4.to_s + $6.to_s + $9.to_s
36
+ end
37
+ else
38
+ raise %(Cannot separate "#{@word}". No rule match next syllable at "#{result.join('')}|>#{rest}")
39
+ end
40
+ end
41
+ result
42
+ end
43
+ end
44
+ end
@@ -0,0 +1,29 @@
1
+ require 'pascoale'
2
+
3
+ correct_counter = 0
4
+ wrong_counter = 0
5
+ word = nil
6
+
7
+ begin
8
+ open("#{Pascoale.root}/data/everything.txt") do |file|
9
+ file.each_line do |line|
10
+ begin
11
+ word, _, separation = eval(line)
12
+ next if word =~ /\-/
13
+ s = Pascoale::SyllableSeparator.new(word).separated
14
+ if s == separation
15
+ correct_counter += 1
16
+ else
17
+ wrong_counter += 1
18
+ puts "#{word} - #{s} - #{separation}"
19
+ end
20
+ rescue => e
21
+ puts e.message
22
+ end
23
+ end
24
+ end
25
+ ensure
26
+ puts " Last: #{word}"
27
+ puts "Correct: #{correct_counter}"
28
+ puts " Wrong: #{wrong_counter}"
29
+ end
@@ -1,3 +1,3 @@
1
1
  module Pascoale
2
- VERSION = "0.0.1"
2
+ VERSION = "0.1.0"
3
3
  end
data/lib/pascoale.rb CHANGED
@@ -1,5 +1,10 @@
1
- require 'pascoale/version'
2
- require 'pascoale/edits'
3
-
4
1
  module Pascoale
2
+ def self.root
3
+ File.expand_path("#{File.dirname(__FILE__)}/..")
4
+ end
5
5
  end
6
+
7
+ require 'pascoale/version'
8
+ require 'pascoale/constants'
9
+ require 'pascoale/edits'
10
+ require 'pascoale/syllable_separator'
data/pascoale.gemspec CHANGED
@@ -19,5 +19,6 @@ Gem::Specification.new do |spec|
19
19
 
20
20
  spec.add_development_dependency 'bundler', ['~>1.5', '>=1.5.1']
21
21
  spec.add_development_dependency 'rake'
22
+ spec.add_development_dependency 'rspec', '~> 3.0.beta'
22
23
  spec.add_development_dependency 'guard-rspec'
23
24
  end
@@ -0,0 +1,150 @@
1
+ require 'spec_helper'
2
+
3
+ RSpec::Matchers.define :separate_as do |expected|
4
+ result = nil
5
+ match do |actual|
6
+ result = Pascoale::SyllableSeparator.new(actual).separated
7
+ result == expected
8
+ end
9
+ failure_message do |actual|
10
+ %( expected "#{actual}" to separate as "#{expected}", but was "#{result}")
11
+ end
12
+ end
13
+
14
+ describe Pascoale::SyllableSeparator do
15
+ it 'separates simple words' do
16
+ expect('bola').to separate_as %w(bo la)
17
+ expect('batata').to separate_as %w(ba ta ta)
18
+ end
19
+
20
+ it 'keeps some "dígrafos" together' do
21
+ expect('chocalho').to separate_as %w(cho ca lho)
22
+ expect('batuque').to separate_as %w(ba tu que)
23
+ expect('guelha').to separate_as %w(gue lha)
24
+ end
25
+
26
+ it 'separates other "dígrafos"' do
27
+ expect('bossa').to separate_as %w(bos sa)
28
+ expect('bosta').to separate_as %w(bos ta)
29
+ expect('cassado').to separate_as %w(cas sa do)
30
+ expect('carrasco').to separate_as %w(car ras co)
31
+ end
32
+
33
+ it 'handles double consonants border cases' do
34
+ expect('destravar').to separate_as %w(des tra var)
35
+ expect('desencontro').to separate_as %w(de sen con tro)
36
+ expect('exaltar').to separate_as %w(e xal tar)
37
+ expect('excomungar').to separate_as %w(ex co mun gar)
38
+ end
39
+
40
+ it 'keeps "oclusivas" and "fricativas" together with "r" or "l"' do
41
+ expect('brasa').to separate_as %w(bra sa)
42
+ expect('agrupado').to separate_as %w(a gru pa do)
43
+ expect('fragrância').to separate_as %w(fra grân ci a)
44
+ expect('protocriogênico').to separate_as %w(pro to cri o gê ni co)
45
+ end
46
+
47
+ it 'keeps long codas' do
48
+ expect('transpiração').to separate_as %w(trans pi ra ção)
49
+ expect('transatlântico').to separate_as %w(tran sa tlân ti co)
50
+ expect('mirins').to separate_as %w(mi rins)
51
+ expect('transerrano').to separate_as %w(tran ser ra no)
52
+ expect('solstício').to separate_as %w(sols tí ci o)
53
+ expect('perspectiva').to separate_as %w(pers pec ti va)
54
+ expect('substância').to separate_as %w(subs tân ci a)
55
+ # Yes! It's a real word o_O
56
+ expect('falansterialismo').to separate_as %w(fa lans te ri a lis mo)
57
+ end
58
+
59
+ it 'separates single vowels at beggining' do
60
+ expect('abacaxi').to separate_as %w(a ba ca xi)
61
+ expect('exceto').to separate_as %w(ex ce to)
62
+ expect('arrocho').to separate_as %w(ar ro cho)
63
+ end
64
+
65
+ it 'keeps "ditongos" together' do
66
+ expect('maizena').to separate_as %w(mai ze na)
67
+ expect('mausoléu').to separate_as %w(mau so léu)
68
+ expect('ação').to separate_as %w(a ção)
69
+ expect('põe').to separate_as %w(põe)
70
+ expect('exceção').to separate_as %w(ex ce ção)
71
+ end
72
+
73
+ it 'handle "tritongos"' do
74
+ expect('ideia').to separate_as %w(i dei a)
75
+ expect('tireoide').to separate_as %w(ti re oi de)
76
+ expect('praia').to separate_as %w(prai a)
77
+ expect('feio').to separate_as %w(fei o)
78
+ expect('vaia').to separate_as %w(vai a)
79
+ end
80
+
81
+ it 'separates "hiatos"' do
82
+ expect('moeda').to separate_as %w(mo e da)
83
+ expect('leal').to separate_as %w(le al)
84
+ expect('aéreo').to separate_as %w(a é re o)
85
+ expect('pior').to separate_as %w(pi or)
86
+ expect('raíz').to separate_as %w(ra íz)
87
+ expect('ruído').to separate_as %w(ru í do)
88
+ end
89
+
90
+ it 'keeps the first consonant of the word together' do
91
+ expect('pneu').to separate_as %w(pneu)
92
+ expect('apneia').to separate_as %w(ap nei a)
93
+ expect('pneumático').to separate_as %w(pneu má ti co)
94
+ expect('piropneumático').to separate_as %w(pi rop neu má ti co)
95
+ expect('mnemônica').to separate_as %w(mne mô ni ca)
96
+ expect('pseudônimo').to separate_as %w(pseu dô ni mo)
97
+ expect('gnomo').to separate_as %w(gno mo)
98
+ end
99
+
100
+ it 'keeps "sineréses" together' do
101
+ expect('saudade').to separate_as %w(sau da de)
102
+ expect('vaidade').to separate_as %w(vai da de)
103
+ expect('suave').to separate_as %w(su a ve)
104
+
105
+ # Not sure how to deal with these
106
+ #expect('traidor').to separate_as %w(trai dor)
107
+ end
108
+
109
+ it 'separates "dieréses"' do
110
+ expect('conluiador').to separate_as %w(con lui a dor)
111
+ expect('conluio').to separate_as %w(con lui o)
112
+ expect('aleluia').to separate_as %w(a le lui a)
113
+ expect('toluico').to separate_as %w(to lui co)
114
+
115
+ expect('alauita').to separate_as %w(a lau i ta)
116
+
117
+ expect('rainha').to separate_as %w(ra i nha)
118
+ expect('tainheira').to separate_as %w(ta i nhei ra)
119
+
120
+ expect('construir').to separate_as %w(cons tru ir)
121
+ expect('destruir').to separate_as %w(des tru ir)
122
+ expect('destruição').to separate_as %w(des tru i ção)
123
+
124
+ #expect('acuidade').to separate_as %w(a cu i da de)
125
+ #expect('ajuizar').to separate_as %w(a ju i zar)
126
+ end
127
+
128
+ it 'separates random words' do
129
+ expect('acidentariamente').to separate_as %w(a ci den ta ri a men te)
130
+ expect('cooperar').to separate_as %w(co o pe rar)
131
+ expect('abstraído').to separate_as %w(abs tra í do)
132
+ expect('abstenção').to separate_as %w(abs ten ção)
133
+ expect('colapso').to separate_as %w(co lap so)
134
+ expect('piauí').to separate_as %w(pi au í)
135
+ expect('aguei').to separate_as %w(a guei)
136
+ expect('compreender').to separate_as %w(com pre en der)
137
+ expect('caatinga').to separate_as %w(ca a tin ga)
138
+ expect('atmosfera').to separate_as %w(at mos fe ra)
139
+
140
+ expect('tuiuiú').to separate_as %w(tui ui ú)
141
+
142
+ # I really don't buy the whole "comes from latin" thing.
143
+ # Our separation is phonetic-based, so keep it that way!
144
+ expect('abrupto').to separate_as %w(a brup to)
145
+ expect('abruptamente').to separate_as %w(a brup ta men te)
146
+ # For example, the word below is correctly separated
147
+ # (as the dictionary says). =\
148
+ expect('abrupção').to separate_as %w(a brup ção)
149
+ end
150
+ end
metadata CHANGED
@@ -1,61 +1,75 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pascoale
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ronie Uliana
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-02-21 00:00:00.000000000 Z
11
+ date: 2014-04-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '1.5'
20
- - - '>='
20
+ - - ">="
21
21
  - !ruby/object:Gem::Version
22
22
  version: 1.5.1
23
23
  type: :development
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
26
26
  requirements:
27
- - - ~>
27
+ - - "~>"
28
28
  - !ruby/object:Gem::Version
29
29
  version: '1.5'
30
- - - '>='
30
+ - - ">="
31
31
  - !ruby/object:Gem::Version
32
32
  version: 1.5.1
33
33
  - !ruby/object:Gem::Dependency
34
34
  name: rake
35
35
  requirement: !ruby/object:Gem::Requirement
36
36
  requirements:
37
- - - '>='
37
+ - - ">="
38
38
  - !ruby/object:Gem::Version
39
39
  version: '0'
40
40
  type: :development
41
41
  prerelease: false
42
42
  version_requirements: !ruby/object:Gem::Requirement
43
43
  requirements:
44
- - - '>='
44
+ - - ">="
45
45
  - !ruby/object:Gem::Version
46
46
  version: '0'
47
+ - !ruby/object:Gem::Dependency
48
+ name: rspec
49
+ requirement: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: 3.0.beta
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: 3.0.beta
47
61
  - !ruby/object:Gem::Dependency
48
62
  name: guard-rspec
49
63
  requirement: !ruby/object:Gem::Requirement
50
64
  requirements:
51
- - - '>='
65
+ - - ">="
52
66
  - !ruby/object:Gem::Version
53
67
  version: '0'
54
68
  type: :development
55
69
  prerelease: false
56
70
  version_requirements: !ruby/object:Gem::Requirement
57
71
  requirements:
58
- - - '>='
72
+ - - ">="
59
73
  - !ruby/object:Gem::Version
60
74
  version: '0'
61
75
  description:
@@ -65,17 +79,26 @@ executables: []
65
79
  extensions: []
66
80
  extra_rdoc_files: []
67
81
  files:
68
- - .gitignore
82
+ - ".gitignore"
83
+ - ".ruby-gemset"
84
+ - ".ruby-version"
69
85
  - Gemfile
70
86
  - Guardfile
71
87
  - LICENSE.txt
72
88
  - README.md
73
89
  - Rakefile
90
+ - data/errors.txt
91
+ - data/everything.txt
92
+ - data/unique_errors.txt
74
93
  - lib/pascoale.rb
94
+ - lib/pascoale/constants.rb
75
95
  - lib/pascoale/edits.rb
96
+ - lib/pascoale/syllable_separator.rb
97
+ - lib/pascoale/syllable_separator_benchmark.rb
76
98
  - lib/pascoale/version.rb
77
99
  - pascoale.gemspec
78
100
  - spec/lib/pascoale/edits_spec.rb
101
+ - spec/lib/pascoale/syllable_separator_spec.rb
79
102
  - spec/spec_helper.rb
80
103
  homepage: http://github.com/ruliana/pascoale
81
104
  licenses:
@@ -87,20 +110,21 @@ require_paths:
87
110
  - lib
88
111
  required_ruby_version: !ruby/object:Gem::Requirement
89
112
  requirements:
90
- - - '>='
113
+ - - ">="
91
114
  - !ruby/object:Gem::Version
92
115
  version: '0'
93
116
  required_rubygems_version: !ruby/object:Gem::Requirement
94
117
  requirements:
95
- - - '>='
118
+ - - ">="
96
119
  - !ruby/object:Gem::Version
97
120
  version: '0'
98
121
  requirements: []
99
122
  rubyforge_project:
100
- rubygems_version: 2.1.11
123
+ rubygems_version: 2.2.0
101
124
  signing_key:
102
125
  specification_version: 4
103
126
  summary: Text processing utilities for Brazilian Portuguese
104
127
  test_files:
105
128
  - spec/lib/pascoale/edits_spec.rb
129
+ - spec/lib/pascoale/syllable_separator_spec.rb
106
130
  - spec/spec_helper.rb