strsyntax 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/strsyntax +42 -0
- data/lib/strsyntax.rb +2 -0
- data/lib/strsyntax/stdlib_ext.rb +53 -0
- data/lib/strsyntax/strsyntax.rb +119 -0
- data/test/test_parser.rb +30 -0
- metadata +50 -0
data/bin/strsyntax
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Command-line front end for the strsyntax library.
#
# Takes a single word and prints one of three views of it, selected by a
# switch (the structure view is the default):
#
#   -y / --syntax      the consonant/vowel groups, e.g. [:cv, :vc, :vc]
#   -p / --parts       the substrings matching those groups
#   -s / --structure   the groups paired with their substrings
#
# Prints the usage text to stderr and exits with status 1 when no word is
# given or when an option cannot be parsed.

require 'optparse'
require 'strsyntax'

# Collected settings; :command names the String method used to render the word.
options = {
  :command => :structure
}

parser = OptionParser.new do |opts|
  opts.banner = "Usage: strsyntax [options] word"
  opts.separator ""

  opts.on( "-y", "--syntax", "Splits string into syntax elements: [:cv,:vc,:vc]" ) do
    options[:command] = :syntax
  end

  opts.on( "-p", "--parts", "Splits string into syntax instances: ['ro','ut','er']" ) do
    options[:command] = :parts
  end

  opts.on( "-s", "--structure", "Splits string into syntax structure: [[:cv,'ro'],[:vc,'ut'],[:vc,'er']]" ) do
    options[:command] = :structure
  end

  opts.separator ""
end

begin
  # parse! strips the recognised switches out of ARGV, leaving only the word.
  parser.parse!( ARGV )
rescue OptionParser::ParseError => e
  warn e.message
  warn parser.help
  exit 1
end

str = ARGV.first

# Without a word there is nothing to analyse; show the usage text instead of
# failing with a NoMethodError on nil.
if str.nil?
  warn parser.help
  exit 1
end

case options[:command]
when :syntax
  puts str.syntax.inspect
when :parts
  puts str.parts.inspect
when :structure
  puts str.structure.inspect
else
  puts parser
end
|
data/lib/strsyntax/stdlib_ext.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
#
|
2
|
+
# Extend standard library classes with methods required by Ngrams
|
3
|
+
#
|
4
|
+
# Refuse to overwrite an existing String#syntax. The check has to look at the
# instance methods of String: String.respond_to? only inspects methods callable
# on the String class object itself and would never detect a clash.
if String.method_defined?( :syntax ) || String.private_method_defined?( :syntax )
  raise "Cannot patch in String#syntax as it is already defined!"
end

class String
  # Returns an array containing the syntax of the string in terms of consonant and vowel
  # groups. The groups are CVC, VCV, CV, VC, C, and V. For example:
  #
  #   "groucho".syntax  => [:c,:cv,:vc,:cv]
  #   "harpo".syntax    => [:cvc,:cv]
  #   "chico".syntax    => [:c,:cv,:cv]
  #   "zeppo".syntax    => [:cvc,:cv]
  #   "teasdale".syntax => [:cv,:vc,:cv,:cv]
  #
  # The work is delegated to StringSyntax::Parser.parse.
  def syntax
    StringSyntax::Parser.parse( self )
  end
end
|
21
|
+
|
22
|
+
# Refuse to overwrite an existing String#parts. The check has to look at the
# instance methods of String: String.respond_to? only inspects methods callable
# on the String class object itself and would never detect a clash.
if String.method_defined?( :parts ) || String.private_method_defined?( :parts )
  raise "Cannot patch in String#parts as it is already defined!"
end

class String
  # Return an array containing the constituent parts of the string as represented by
  # its syntax. For example:
  #
  #   "groucho".parts  => ["g","ro","uc","ho"]
  #   "harpo".parts    => ["har","po"]
  #   "chico".parts    => ["c","hi","co"]
  #   "zeppo".parts    => ["zep","po"]
  #   "teasdale".parts => ["te","as","da","le"]
  #
  # The work is delegated to StringSyntax::Parser.split.
  def parts
    StringSyntax::Parser.split( self )
  end
end
|
39
|
+
|
40
|
+
# Refuse to overwrite an existing String#structure. The check has to look at
# the instance methods of String: String.respond_to? only inspects methods
# callable on the String class object itself and would never detect a clash.
if String.method_defined?( :structure ) || String.private_method_defined?( :structure )
  raise "Cannot patch in String#structure as it is already defined!"
end

class String
  # Returns an array pairing each element of #syntax with the element of #parts
  # at the same position. Example:
  #
  #   "groucho".structure => [[:c,"g"],[:cv,"ro"],[:vc,"uc"],[:cv,"ho"]]
  #
  def structure
    syntax.zip( parts )
  end
end
|
@@ -0,0 +1,119 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Can we decode any word into some combination of
|
4
|
+
# CV, VC, CVC, VCV
|
5
|
+
# ?
|
6
|
+
#
|
7
|
+
|
8
|
+
module StringSyntax
  # Breaks a word down into consonant/vowel groups.
  #
  # A "schema" is one of the symbols :c, :v, :cv, :vc, :cvc, :vcv and a
  # "template" is the regular expression that recognises that group at the
  # very start of a string. Only the letters a-z are recognised; "y" counts as
  # a consonant. Parsing stops at the first character that is not a lowercase
  # (after downcasing) ASCII letter.
  class Parser
    # Templates are anchored with \A (start of string) rather than ^ (start of
    # any line) so that multi-line input cannot match part-way through.
    C   = /\A([bcdfghjklmnpqrstvwxyz])/
    V   = /\A([aeiou])/
    CV  = /\A([bcdfghjklmnpqrstvwxyz][aeiou])/
    CVC = /\A([bcdfghjklmnpqrstvwxyz][aeiou][bcdfghjklmnpqrstvwxyz])/
    VC  = /\A([aeiou][bcdfghjklmnpqrstvwxyz])/
    VCV = /\A([aeiou][bcdfghjklmnpqrstvwxyz][aeiou])/

    # Template => schema lookup.
    SCHEMA_TEMPLATES = {
      C   => :c,
      V   => :v,
      CV  => :cv,
      CVC => :cvc,
      VC  => :vc,
      VCV => :vcv
    }.freeze

    # Schema => template lookup (the inverse of SCHEMA_TEMPLATES).
    TEMPLATE_FOR_SCHEMA = SCHEMA_TEMPLATES.invert.freeze

    # Templates for a leading consonant and for a leading vowel, each ordered
    # from the longest group to the shortest.
    TEMPLATE_CHAINS = [ [ CVC, CV, C ], [ VCV, VC, V ] ].freeze

    # Returns an array containing the components of the string in terms of consonant
    # and vowel groupings: CVC, VCV, CV, VC, C, V.
    #
    # At each position the longest matching group and the next shorter one are
    # both tried; the alternative whose complete parse contains the fewest
    # stray :c / :v elements wins, with the longer group preferred on a tie.
    #
    # The string is downcased first. Returns [] when the string is empty or
    # does not start with a letter.
    def self.parse( s )
      best_structure( s.downcase, {} )
    end

    # Return the number of :c and :v components in the structure
    def self.score( s )
      s.count { |e| e == :c || e == :v }
    end

    # Splits +s+ into the substrings described by +structure+ (an array of
    # schema symbols, by default the result of parse).
    #
    # Matching is done against the downcased string, as in parse, but the
    # returned pieces are cut from +s+ itself so the caller's letter case is
    # kept. This assumes downcasing does not change the length of the string,
    # which holds for ASCII input.
    #
    # Raises ArgumentError when +structure+ does not describe +s+, and a
    # RuntimeError when it contains an unknown schema.
    def self.split( s, structure = parse( s ) )
      lowered = s.downcase
      offset = 0

      structure.map do |schema|
        head = lowered[offset..-1][template_from_schema( schema ), 1]
        unless head
          raise ArgumentError, "#{s.inspect} does not match #{schema.inspect} at offset #{offset}"
        end

        part = s[offset, head.length]
        offset += head.length
        part
      end
    end

    # Returns the schema symbol for a template, or nil for an unknown template.
    def self.schema_from_template( template )
      SCHEMA_TEMPLATES[template]
    end

    # Returns the template for a schema symbol.
    # Raises a RuntimeError for an unknown schema.
    def self.template_from_schema( schema )
      TEMPLATE_FOR_SCHEMA.fetch( schema ) { raise "Unknown schema: #{schema}" }
    end

    # Returns the schema of the group found at the start of +s+, or nil when
    # none matches. Unlike parse, the string is not downcased.
    def self.schema_from_string( s )
      case s
      when CVC
        :cvc
      when CV
        :cv
      when VCV
        :vcv
      when VC
        :vc
      when C
        :c
      when V
        :v
      end
    end

    # Best structure for the already-downcased string +s+. +memo+ caches the
    # result for every suffix so each suffix is parsed only once; without it
    # the two-way branching makes the running time exponential in the length
    # of the word.
    def self.best_structure( s, memo )
      return memo[s] if memo.key?( s )

      candidates = candidate_templates( s ).map do |template|
        head = s[template, 1]
        parse_subpart( template, head, s[head.length..-1], memo )
      end

      # min_by keeps the first of several equally good candidates, i.e. the
      # one that starts with the longer group. No candidates means nothing
      # left to parse.
      memo[s] = candidates.min_by { |candidate| score( candidate ) } || []
    end

    # The templates worth trying at the start of +s+: the longest one that
    # matches plus the next shorter one, if there is one.
    def self.candidate_templates( s )
      TEMPLATE_CHAINS.each do |chain|
        longest = chain.index { |template| template =~ s }
        return chain[longest, 2] if longest
      end
      []
    end

    # Structure that starts with the schema of +template+ and continues with
    # the best structure of +rest+. +_match+ is accepted for compatibility
    # with the original signature but is not used.
    def self.parse_subpart( template, _match, rest, memo = {} )
      [ schema_from_template( template ), *best_structure( rest, memo ) ].compact
    end

    # A bare `private` does not affect methods defined with `def self.`, so
    # the helpers are hidden explicitly.
    private_class_method :best_structure, :candidate_templates, :parse_subpart
  end
end
|
112
|
+
|
113
|
+
# Demo: when this file is executed directly (rather than required), print the
# parsed structure of a handful of sample words, one per line.
if $0 == __FILE__
  WORDS = %w( hello google amazon ookles paoga linguistics antidisestablishmentarianism )

  WORDS.each do |sample|
    breakdown = StringSyntax::Parser.parse( sample )
    puts "#{sample} -> #{breakdown.inspect}"
  end
end
|
data/test/test_parser.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
$:<< File.join( File.dirname( __FILE__ ), '..', 'lib' )
|
2
|
+
|
3
|
+
require 'strsyntax'
|
4
|
+
require 'test/unit'
|
5
|
+
|
6
|
+
# Exercises the String#syntax, String#parts and String#structure extensions.
class TestParser < Test::Unit::TestCase

  # Each word must break down into the expected sequence of group symbols.
  def test_syntax
    [
      [ "hello",       [:cvc,:cv] ],
      [ "amplitude",   [:vc,:c,:cvc,:vcv] ],
      [ "transmitter", [:c,:cvc,:c,:cvc,:cvc] ],
      [ "google",      [:cv,:vc,:cv] ],
      [ "aluminium",   [:vcv,:cv,:cv,:vc] ],
      [ "teasdale",    [:cv,:vc,:cv,:cv] ]
    ].each do |word, expected|
      assert_equal expected, word.syntax
    end
  end

  # Each word must be cut into the substrings that match its syntax.
  def test_parts
    [
      [ "hello",       ["hel","lo"] ],
      [ "amplitude",   ["am","p","lit","ude"] ],
      [ "transmitter", ["t","ran","s","mit","ter"] ],
      [ "google",      ["go","og","le"] ],
      [ "aluminium",   ["alu","mi", "ni","um"] ],
      [ "teasdale",    ["te","as","da","le"] ]
    ].each do |word, expected|
      assert_equal expected, word.parts
    end
  end

  # The structure pairs every group symbol with its substring.
  def test_structure
    expected = [[:c,"g"],[:cv,"ro"],[:vc,"uc"],[:cv,"ho"]]
    assert_equal expected, "groucho".structure
  end

end
|
metadata
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.9.0
|
3
|
+
specification_version: !int:Fixnum 1
|
4
|
+
name: strsyntax
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.1.2
|
7
|
+
date: 2006-07-28 00:00:00 +01:00
|
8
|
+
summary: A library to return the syntax/structure of a word in terms of consonant and vowel groups.
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: self@mattmower.com
|
12
|
+
homepage: http://rubyforge.org/projects/rubymatt/
|
13
|
+
rubyforge_project: rubymatt
|
14
|
+
description:
|
15
|
+
autorequire: strsyntax
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
post_install_message:
|
29
|
+
authors:
|
30
|
+
- Matt Mower
|
31
|
+
files:
|
32
|
+
- lib/strsyntax.rb
|
33
|
+
- lib/strsyntax/stdlib_ext.rb
|
34
|
+
- lib/strsyntax/strsyntax.rb
|
35
|
+
- test/test_parser.rb
|
36
|
+
- bin/strsyntax
|
37
|
+
test_files: []
|
38
|
+
|
39
|
+
rdoc_options: []
|
40
|
+
|
41
|
+
extra_rdoc_files: []
|
42
|
+
|
43
|
+
executables:
|
44
|
+
- strsyntax
|
45
|
+
extensions: []
|
46
|
+
|
47
|
+
requirements: []
|
48
|
+
|
49
|
+
dependencies: []
|
50
|
+
|