pascoale 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
module Pascoale
  # Character classes shared by the text-processing utilities.
  #
  # Every constant is a plain String of single characters, suitable for
  # interpolation inside a regexp character class ("[#{VOWELS}]") or for
  # splitting into individual letters. All letters are lowercase.
  #
  # The strings are frozen: they are shared by every class that includes
  # this module, so an accidental in-place mutation (<<, gsub!, ...) would
  # silently affect all of them.
  module Constants
    # Plain and accented vowels. "y" is listed here, not in CONSONANTS.
    VOWELS = 'aeiouáéíóúâêôãõày'.freeze

    # Vowels that may act as the glide of a diphthong.
    SEMIVOWELS = 'iu'.freeze

    # Consonants, including "k", "w" and "ç".
    CONSONANTS = 'bcdfghjklmnpqrstvwxzç'.freeze

    # Every letter known to the library: vowels first, then consonants.
    LETTERS = (VOWELS + CONSONANTS).freeze
  end
end
@@ -1,5 +1,5 @@
1
1
  class Pascoale::Edits
2
- LETTERS = ' abcdefghijklmnopqrstuvwxyzáéíóúâêôãõç'.scan(/./)
2
+ LETTERS = [' '] + Pascoale::Constants::LETTERS.scan(/./)
3
3
 
4
4
  def initialize(word)
5
5
  @splits = (0..(word.size)).map do |i|
@@ -0,0 +1,44 @@
1
module Pascoale
  # Splits a word into syllables with regular expressions assembled from
  # onset / nucleus / coda patterns.
  #
  #   Pascoale::SyllableSeparator.new('batata').separated
  #   # => ["ba", "ta", "ta"]
  #
  # The patterns are built from the character classes in Constants and are
  # case-sensitive, so the word is expected to use the same letters that
  # Constants lists.
  class SyllableSeparator
    include Constants

    # Consonant sound opening a syllable: a digraph (ch, lh, nh, gu, qu),
    # one of "pbtdcgfv" followed by "l" or "r", or any single consonant.
    ONSET = "(?:ch|lh|nh|gu|qu|[pbtdcgfv][lr]|[#{CONSONANTS}])".freeze

    # Still in doubt if we should add suffixes to the "i" semivowel...
    # it slightly improves the matches, but some of them cause more
    # noise than they fix =\
    #NUCLEUS = "(?:ãe|ão|õe|[#{VOWELS}](?:u|i(?!nh|r$|m$|dora?$|ção$|dade$))?)"

    # Vowel core of a syllable: a nasal diphthong (ãe, ão, õe) or a vowel
    # optionally followed by the semivowel "u" or "i". The "i" is not taken
    # as a semivowel when it is followed by "nh", or by "r", "m", "ção",
    # "dor" or "dora" at the end of the line.
    NUCLEUS = "(?:ãe|ão|õe|[#{VOWELS}](?:u|i(?!nh|r$|m$|ção$|dora?$))?)".freeze

    # A single consonant closing a syllable.
    CODA = "[#{CONSONANTS}]".freeze

    # The concept of "rhyme" does not help in this algorithm. It seems the
    # concept makes no sense for syllable separation in portuguese
    KERNEL = "#{ONSET}?#{NUCLEUS}".freeze

    # What may follow the head of a syllable. Capture groups, counting the
    # head as group 1:
    #   2          next kernel, directly after the head
    #   3, 4       one coda consonant, then the next kernel
    #   5, 6       two coda consonants, then the next kernel
    #   7          two coda consonants with no kernel after them
    #   8          one coda consonant with no kernel after it
    #   9          everything that is left on the line
    # NOTE(review): "^" and "$" anchor at line boundaries, not string
    # boundaries; the input is assumed to be a single word without newlines.
    SYLLABLE_TAIL =
      "(?:(#{KERNEL})|(#{CODA})(#{KERNEL})|(#{CODA}#{CODA})(#{KERNEL})|(#{CODA}#{CODA})|(#{CODA}))?(.*)$".freeze

    # Regular syllable: the head is a kernel.
    SYLLABLE = /^(#{KERNEL})#{SYLLABLE_TAIL}/

    # Special case! Hate them :(
    # Pneu, Gnomo, Mnemônica, Pseudônimo
    # Word-initial syllable whose kernel is preceded by an extra consonant.
    LEADING_CLUSTER_SYLLABLE = /^([#{CONSONANTS}]#{KERNEL})#{SYLLABLE_TAIL}/

    # Groups that belong to the syllable being emitted (head plus codas).
    SYLLABLE_GROUPS = [1, 3, 5, 7, 8].freeze

    # Groups that are handed back to the next iteration (a kernel that was
    # only matched as look-ahead context, plus the untouched remainder).
    REMAINDER_GROUPS = [2, 4, 6, 9].freeze

    private_constant :SYLLABLE_TAIL, :SYLLABLE, :LEADING_CLUSTER_SYLLABLE,
                     :SYLLABLE_GROUPS, :REMAINDER_GROUPS

    # word - the String to separate.
    def initialize(word)
      @word = word
    end

    # Separates the word into syllables.
    #
    # Consumes one syllable per iteration from the left of the word. The
    # leading-consonant-cluster rule is only tried for the first syllable.
    #
    # Returns an Array of Strings, one per syllable; an empty Array when the
    # word is nil or empty.
    # Raises RuntimeError when no rule matches the next syllable. This now
    # also covers a first syllable that matches neither rule, which
    # previously left the remainder unchanged and looped forever.
    def separated
      rest = @word
      result = []
      until rest.nil? || rest.empty?
        match = SYLLABLE.match(rest)
        match ||= LEADING_CLUSTER_SYLLABLE.match(rest) if result.empty?
        unless match
          raise %(Cannot separate "#{@word}". No rule match next syllable at "#{result.join('')}|>#{rest}")
        end

        # Unmatched groups are nil; Array#join renders them as "".
        result << match.values_at(*SYLLABLE_GROUPS).join
        rest = match.values_at(*REMAINDER_GROUPS).join
      end
      result
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require 'pascoale'
2
+
3
# Runs SyllableSeparator over every entry of data/everything.txt and reports
# how many separations agree with the expected one stored in the file.
# Mismatches and per-word errors are printed as they are found; the totals
# are printed at the end, even if the run is interrupted by an error.

correct_counter = 0
wrong_counter = 0
word = nil

begin
  # File.open rather than Kernel#open: Kernel#open treats a path starting
  # with "|" as a command to run, File.open only ever opens a file.
  File.open("#{Pascoale.root}/data/everything.txt") do |file|
    file.each_line do |line|
      begin
        # SECURITY: eval executes each line as Ruby code. That is only
        # acceptable because the data file ships with the gem; never point
        # this script at a file from an untrusted source.
        # NOTE(review): each line presumably evaluates to
        # [word, <ignored>, expected_syllables] - confirm against the data.
        word, _, separation = eval(line)
        # Hyphenated entries are not handled by the separator; skip them.
        next if word =~ /\-/
        s = Pascoale::SyllableSeparator.new(word).separated
        if s == separation
          correct_counter += 1
        else
          wrong_counter += 1
          puts "#{word} - #{s} - #{separation}"
        end
      rescue => e
        # A word that cannot be separated is reported, not counted, and the
        # run continues with the next line.
        puts e.message
      end
    end
  end
ensure
  puts " Last: #{word}"
  puts "Correct: #{correct_counter}"
  puts " Wrong: #{wrong_counter}"
end
@@ -1,3 +1,3 @@
1
1
  module Pascoale
2
- VERSION = "0.0.1"
2
+ VERSION = "0.1.0"
3
3
  end
data/lib/pascoale.rb CHANGED
@@ -1,5 +1,10 @@
1
- require 'pascoale/version'
2
- require 'pascoale/edits'
3
-
4
1
  module Pascoale
2
+ def self.root
3
+ File.expand_path("#{File.dirname(__FILE__)}/..")
4
+ end
5
5
  end
6
+
7
+ require 'pascoale/version'
8
+ require 'pascoale/constants'
9
+ require 'pascoale/edits'
10
+ require 'pascoale/syllable_separator'
data/pascoale.gemspec CHANGED
@@ -19,5 +19,6 @@ Gem::Specification.new do |spec|
19
19
 
20
20
  spec.add_development_dependency 'bundler', ['~>1.5', '>=1.5.1']
21
21
  spec.add_development_dependency 'rake'
22
+ spec.add_development_dependency 'rspec', '~> 3.0.beta'
22
23
  spec.add_development_dependency 'guard-rspec'
23
24
  end
@@ -0,0 +1,150 @@
1
+ require 'spec_helper'
2
+
3
+ RSpec::Matchers.define :separate_as do |expected|
4
+ result = nil
5
+ match do |actual|
6
+ result = Pascoale::SyllableSeparator.new(actual).separated
7
+ result == expected
8
+ end
9
+ failure_message do |actual|
10
+ %( expected "#{actual}" to separate as "#{expected}", but was "#{result}")
11
+ end
12
+ end
13
+
14
+ describe Pascoale::SyllableSeparator do
15
+ it 'separates simple words' do
16
+ expect('bola').to separate_as %w(bo la)
17
+ expect('batata').to separate_as %w(ba ta ta)
18
+ end
19
+
20
+ it 'keeps some "dígrafos" together' do
21
+ expect('chocalho').to separate_as %w(cho ca lho)
22
+ expect('batuque').to separate_as %w(ba tu que)
23
+ expect('guelha').to separate_as %w(gue lha)
24
+ end
25
+
26
+ it 'separates other "dígrafos"' do
27
+ expect('bossa').to separate_as %w(bos sa)
28
+ expect('bosta').to separate_as %w(bos ta)
29
+ expect('cassado').to separate_as %w(cas sa do)
30
+ expect('carrasco').to separate_as %w(car ras co)
31
+ end
32
+
33
+ it 'handles double consonants border cases' do
34
+ expect('destravar').to separate_as %w(des tra var)
35
+ expect('desencontro').to separate_as %w(de sen con tro)
36
+ expect('exaltar').to separate_as %w(e xal tar)
37
+ expect('excomungar').to separate_as %w(ex co mun gar)
38
+ end
39
+
40
+ it 'keeps "oclusivas" and "fricativas" together with "r" or "l"' do
41
+ expect('brasa').to separate_as %w(bra sa)
42
+ expect('agrupado').to separate_as %w(a gru pa do)
43
+ expect('fragrância').to separate_as %w(fra grân ci a)
44
+ expect('protocriogênico').to separate_as %w(pro to cri o gê ni co)
45
+ end
46
+
47
+ it 'keeps long codas' do
48
+ expect('transpiração').to separate_as %w(trans pi ra ção)
49
+ expect('transatlântico').to separate_as %w(tran sa tlân ti co)
50
+ expect('mirins').to separate_as %w(mi rins)
51
+ expect('transerrano').to separate_as %w(tran ser ra no)
52
+ expect('solstício').to separate_as %w(sols tí ci o)
53
+ expect('perspectiva').to separate_as %w(pers pec ti va)
54
+ expect('substância').to separate_as %w(subs tân ci a)
55
+ # Yes! It's a real word o_O
56
+ expect('falansterialismo').to separate_as %w(fa lans te ri a lis mo)
57
+ end
58
+
59
+ it 'separates single vowels at beggining' do
60
+ expect('abacaxi').to separate_as %w(a ba ca xi)
61
+ expect('exceto').to separate_as %w(ex ce to)
62
+ expect('arrocho').to separate_as %w(ar ro cho)
63
+ end
64
+
65
+ it 'keeps "ditongos" together' do
66
+ expect('maizena').to separate_as %w(mai ze na)
67
+ expect('mausoléu').to separate_as %w(mau so léu)
68
+ expect('ação').to separate_as %w(a ção)
69
+ expect('põe').to separate_as %w(põe)
70
+ expect('exceção').to separate_as %w(ex ce ção)
71
+ end
72
+
73
+ it 'handle "tritongos"' do
74
+ expect('ideia').to separate_as %w(i dei a)
75
+ expect('tireoide').to separate_as %w(ti re oi de)
76
+ expect('praia').to separate_as %w(prai a)
77
+ expect('feio').to separate_as %w(fei o)
78
+ expect('vaia').to separate_as %w(vai a)
79
+ end
80
+
81
+ it 'separates "hiatos"' do
82
+ expect('moeda').to separate_as %w(mo e da)
83
+ expect('leal').to separate_as %w(le al)
84
+ expect('aéreo').to separate_as %w(a é re o)
85
+ expect('pior').to separate_as %w(pi or)
86
+ expect('raíz').to separate_as %w(ra íz)
87
+ expect('ruído').to separate_as %w(ru í do)
88
+ end
89
+
90
+ it 'keeps the first consonant of the word together' do
91
+ expect('pneu').to separate_as %w(pneu)
92
+ expect('apneia').to separate_as %w(ap nei a)
93
+ expect('pneumático').to separate_as %w(pneu má ti co)
94
+ expect('piropneumático').to separate_as %w(pi rop neu má ti co)
95
+ expect('mnemônica').to separate_as %w(mne mô ni ca)
96
+ expect('pseudônimo').to separate_as %w(pseu dô ni mo)
97
+ expect('gnomo').to separate_as %w(gno mo)
98
+ end
99
+
100
+ it 'keeps "sineréses" together' do
101
+ expect('saudade').to separate_as %w(sau da de)
102
+ expect('vaidade').to separate_as %w(vai da de)
103
+ expect('suave').to separate_as %w(su a ve)
104
+
105
+ # Not sure how to deal with these
106
+ #expect('traidor').to separate_as %w(trai dor)
107
+ end
108
+
109
+ it 'separates "dieréses"' do
110
+ expect('conluiador').to separate_as %w(con lui a dor)
111
+ expect('conluio').to separate_as %w(con lui o)
112
+ expect('aleluia').to separate_as %w(a le lui a)
113
+ expect('toluico').to separate_as %w(to lui co)
114
+
115
+ expect('alauita').to separate_as %w(a lau i ta)
116
+
117
+ expect('rainha').to separate_as %w(ra i nha)
118
+ expect('tainheira').to separate_as %w(ta i nhei ra)
119
+
120
+ expect('construir').to separate_as %w(cons tru ir)
121
+ expect('destruir').to separate_as %w(des tru ir)
122
+ expect('destruição').to separate_as %w(des tru i ção)
123
+
124
+ #expect('acuidade').to separate_as %w(a cu i da de)
125
+ #expect('ajuizar').to separate_as %w(a ju i zar)
126
+ end
127
+
128
+ it 'separates random words' do
129
+ expect('acidentariamente').to separate_as %w(a ci den ta ri a men te)
130
+ expect('cooperar').to separate_as %w(co o pe rar)
131
+ expect('abstraído').to separate_as %w(abs tra í do)
132
+ expect('abstenção').to separate_as %w(abs ten ção)
133
+ expect('colapso').to separate_as %w(co lap so)
134
+ expect('piauí').to separate_as %w(pi au í)
135
+ expect('aguei').to separate_as %w(a guei)
136
+ expect('compreender').to separate_as %w(com pre en der)
137
+ expect('caatinga').to separate_as %w(ca a tin ga)
138
+ expect('atmosfera').to separate_as %w(at mos fe ra)
139
+
140
+ expect('tuiuiú').to separate_as %w(tui ui ú)
141
+
142
+ # I really don't buy the whole "comes from latin" thing.
143
+ # Our separation is phonetic based, so keep it that way!
144
+ expect('abrupto').to separate_as %w(a brup to)
145
+ expect('abruptamente').to separate_as %w(a brup ta men te)
146
+ # For example, the word below is correctly separated
147
+ # (as the dictionary says). =\
148
+ expect('abrupção').to separate_as %w(a brup ção)
149
+ end
150
+ end
metadata CHANGED
@@ -1,61 +1,75 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pascoale
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ronie Uliana
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-02-21 00:00:00.000000000 Z
11
+ date: 2014-04-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '1.5'
20
- - - '>='
20
+ - - ">="
21
21
  - !ruby/object:Gem::Version
22
22
  version: 1.5.1
23
23
  type: :development
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
26
26
  requirements:
27
- - - ~>
27
+ - - "~>"
28
28
  - !ruby/object:Gem::Version
29
29
  version: '1.5'
30
- - - '>='
30
+ - - ">="
31
31
  - !ruby/object:Gem::Version
32
32
  version: 1.5.1
33
33
  - !ruby/object:Gem::Dependency
34
34
  name: rake
35
35
  requirement: !ruby/object:Gem::Requirement
36
36
  requirements:
37
- - - '>='
37
+ - - ">="
38
38
  - !ruby/object:Gem::Version
39
39
  version: '0'
40
40
  type: :development
41
41
  prerelease: false
42
42
  version_requirements: !ruby/object:Gem::Requirement
43
43
  requirements:
44
- - - '>='
44
+ - - ">="
45
45
  - !ruby/object:Gem::Version
46
46
  version: '0'
47
+ - !ruby/object:Gem::Dependency
48
+ name: rspec
49
+ requirement: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: 3.0.beta
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: 3.0.beta
47
61
  - !ruby/object:Gem::Dependency
48
62
  name: guard-rspec
49
63
  requirement: !ruby/object:Gem::Requirement
50
64
  requirements:
51
- - - '>='
65
+ - - ">="
52
66
  - !ruby/object:Gem::Version
53
67
  version: '0'
54
68
  type: :development
55
69
  prerelease: false
56
70
  version_requirements: !ruby/object:Gem::Requirement
57
71
  requirements:
58
- - - '>='
72
+ - - ">="
59
73
  - !ruby/object:Gem::Version
60
74
  version: '0'
61
75
  description:
@@ -65,17 +79,26 @@ executables: []
65
79
  extensions: []
66
80
  extra_rdoc_files: []
67
81
  files:
68
- - .gitignore
82
+ - ".gitignore"
83
+ - ".ruby-gemset"
84
+ - ".ruby-version"
69
85
  - Gemfile
70
86
  - Guardfile
71
87
  - LICENSE.txt
72
88
  - README.md
73
89
  - Rakefile
90
+ - data/errors.txt
91
+ - data/everything.txt
92
+ - data/unique_errors.txt
74
93
  - lib/pascoale.rb
94
+ - lib/pascoale/constants.rb
75
95
  - lib/pascoale/edits.rb
96
+ - lib/pascoale/syllable_separator.rb
97
+ - lib/pascoale/syllable_separator_benchmark.rb
76
98
  - lib/pascoale/version.rb
77
99
  - pascoale.gemspec
78
100
  - spec/lib/pascoale/edits_spec.rb
101
+ - spec/lib/pascoale/syllable_separator_spec.rb
79
102
  - spec/spec_helper.rb
80
103
  homepage: http://github.com/ruliana/pascoale
81
104
  licenses:
@@ -87,20 +110,21 @@ require_paths:
87
110
  - lib
88
111
  required_ruby_version: !ruby/object:Gem::Requirement
89
112
  requirements:
90
- - - '>='
113
+ - - ">="
91
114
  - !ruby/object:Gem::Version
92
115
  version: '0'
93
116
  required_rubygems_version: !ruby/object:Gem::Requirement
94
117
  requirements:
95
- - - '>='
118
+ - - ">="
96
119
  - !ruby/object:Gem::Version
97
120
  version: '0'
98
121
  requirements: []
99
122
  rubyforge_project:
100
- rubygems_version: 2.1.11
123
+ rubygems_version: 2.2.0
101
124
  signing_key:
102
125
  specification_version: 4
103
126
  summary: Text processing utilities for Brazilian Portuguese
104
127
  test_files:
105
128
  - spec/lib/pascoale/edits_spec.rb
129
+ - spec/lib/pascoale/syllable_separator_spec.rb
106
130
  - spec/spec_helper.rb