ovec 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/TODO +1 -0
- data/bin/ovec +5 -18
- data/lib/ovec/tier.rb +27 -11
- data/lib/ovec/version.rb +1 -1
- data/test/lib/ovec/tier.rb +39 -17
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 072ab08b381a445811c64db29452dd28f5b1c575
|
4
|
+
data.tar.gz: 7fc5d8aa41477ffcf71499df5d44cc88f35a52dd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 22dbbd36ec18a956479d356ba949405b3e211631b3ec89f1f30d0e9435a69a7d1d4590b525b1d909dc57d88974cc80d38f6f2a6ffdf2276f5e25c403667e5aa9
|
7
|
+
data.tar.gz: 2e193c2a77a1daefeff432b8ab8da85f8f70807e76e1194268f0170fa72a69c9a844352466df5b3e760612fdcf1d60985a7328c2e12d5f533fb48f00c3704ffc
|
data/TODO
CHANGED
data/bin/ovec
CHANGED
@@ -6,13 +6,9 @@ require 'optparse'
|
|
6
6
|
options = {}
|
7
7
|
|
8
8
|
opts_parser = OptionParser.new do |opts|
|
9
|
-
opts.banner = "Usage: ovec [options]"
|
9
|
+
opts.banner = "Usage: ovec [options] [input file(s)]"
|
10
10
|
|
11
|
-
opts.on("-
|
12
|
-
options[:input] = filename
|
13
|
-
end
|
14
|
-
|
15
|
-
opts.on("-o", "--output FILENAME", "Output filename or - for stdout (required)") do |filename|
|
11
|
+
opts.on("-o", "--output FILENAME", "Output filename (STDOUT is used if omitted)") do |filename|
|
16
12
|
options[:output] = filename
|
17
13
|
end
|
18
14
|
|
@@ -31,19 +27,11 @@ opts_parser = OptionParser.new do |opts|
|
|
31
27
|
end
|
32
28
|
end
|
33
29
|
|
34
|
-
input_file = nil
|
35
30
|
output_file = nil
|
36
31
|
|
37
32
|
begin
|
38
33
|
opts_parser.parse!
|
39
|
-
|
40
|
-
$stderr.puts "You must specify both an input file and an output file."
|
41
|
-
$stderr.puts opts_parser.help
|
42
|
-
exit 1
|
43
|
-
end
|
44
|
-
|
45
|
-
input_file = (options[:input] == '-') ? STDIN : File.open(options[:input], "r")
|
46
|
-
output_file = (options[:output] == '-') ? STDOUT : File.open(options[:output], "w")
|
34
|
+
output_file = options[:output] ? File.open(options[:output], "w") : STDOUT
|
47
35
|
rescue OptionParser::ParseError => ex
|
48
36
|
$stderr.puts "Invalid options: #{ex.message}"
|
49
37
|
$stderr.puts opts_parser.help
|
@@ -53,7 +41,8 @@ rescue StandardError => ex
|
|
53
41
|
exit 1
|
54
42
|
end
|
55
43
|
|
56
|
-
|
44
|
+
# OptionParser#parse! changes ARGV, so ARGF works here.
|
45
|
+
content = ARGF.read
|
57
46
|
|
58
47
|
parser = Ovec::Parser.new(debug: options[:debug])
|
59
48
|
tree = parser.parse(content)
|
@@ -66,6 +55,4 @@ tier = Ovec::Tier.new
|
|
66
55
|
tm.run_text_manipulator(tier)
|
67
56
|
|
68
57
|
output_file.puts tree.to_tex
|
69
|
-
|
70
|
-
input_file.close
|
71
58
|
output_file.close
|
data/lib/ovec/tier.rb
CHANGED
@@ -4,20 +4,20 @@ module Ovec
|
|
4
4
|
class Tier < TextManipulator
|
5
5
|
# The last character this regex matches is changed to a tilde.
|
6
6
|
REGEX = /(
|
7
|
-
((\p{Z}
|
8
|
-
([\.\?\!](\p{Z}|\~)+[KSVZOUAI]\p{Z})| # KSVZOUAI na zacatku vety
|
9
|
-
(\A[KSVZOUAI]\p{Z})| # KSVZOUAI na zacatku textu
|
7
|
+
((\p{Z}|[~\n()\[\]\{\}])[KkSsVvZzOoUu](\p{Z}|\n))| # KSVZOU jako samostatne slovo
|
8
|
+
([\.\?\!](\p{Z}|\~)+[KSVZOUAI](\p{Z}|\n))| # KSVZOUAI na zacatku vety
|
9
|
+
(\A[KSVZOUAI](\p{Z}|\n))| # KSVZOUAI na zacatku textu
|
10
10
|
(\p{Z}(?=--(\p{Z}|\n)))| # mezera, za kterou je pomlcka
|
11
|
-
(,(\p{Z}|\~|\n)+a\p{Z}) # ... modulo 10, a~timto prvkem ...; TODO: plati tohle i pro "i"?
|
11
|
+
(,(\p{Z}|\~|\n)+a(\p{Z}|\n)) # ... modulo 10, a~timto prvkem ...; TODO: plati tohle i pro "i"?
|
12
12
|
)/x
|
13
13
|
|
14
14
|
# TODO: generally tie "5.~batalion", ...
|
15
15
|
# All changes within this regex are changed to a tilde.
|
16
16
|
DATE_REGEX = /(
|
17
|
-
(?<=\p{Z})\p{Nd}{1,2}\.\p{Z}
|
17
|
+
(?<=\p{Z}|\A)\p{Nd}{1,2}\.\p{Z}
|
18
18
|
(\p{Nd}{1,2}\.|leden|únor|březen|duben|květen|červen|červenec|srpen|září|říjen|listopad|prosinec| # TODO: plne sklonovani? nebo nejaky wildcard?
|
19
19
|
ledna|února|března|dubna|května|června|července|srpna|září|října|listopadu|prosince)\p{Z}
|
20
|
-
\p{Nd}{4}(
|
20
|
+
\p{Nd}{4}(?=(\p{Z}|[.,?!]|\Z)) # Datum jako "1. 5. 2013"
|
21
21
|
)/x
|
22
22
|
|
23
23
|
def run
|
@@ -33,7 +33,24 @@ module Ovec
|
|
33
33
|
match = matches[i]
|
34
34
|
change = match.end(0) - 1
|
35
35
|
chunk, offset = _find_chunk_and_offset(change)
|
36
|
+
|
37
|
+
former_character = chunk[offset]
|
38
|
+
|
36
39
|
chunk[offset] = '~'
|
40
|
+
|
41
|
+
if former_character == '\n'
|
42
|
+
# If we changed a newline to a tilde, change previous space to a
|
43
|
+
# newline -- move the tied word to the other line.
|
44
|
+
j = change - 1
|
45
|
+
while j >= 0
|
46
|
+
if @joined[j] == ' '
|
47
|
+
chunk, offset = _find_chunk_and_offset(j)
|
48
|
+
chunk[offset] = '\n'
|
49
|
+
break
|
50
|
+
end
|
51
|
+
j -= 1
|
52
|
+
end
|
53
|
+
end
|
37
54
|
end
|
38
55
|
|
39
56
|
_rejoin
|
@@ -42,11 +59,10 @@ module Ovec
|
|
42
59
|
|
43
60
|
# Dates can't overlap. 1 scan is enough.
|
44
61
|
matches = @joined.to_enum(:scan, DATE_REGEX).map { Regexp.last_match }
|
45
|
-
for
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
chunk, offset = _find_chunk_and_offset(j)
|
62
|
+
for match in matches
|
63
|
+
for i in (match.begin(0))...(match.end(0))
|
64
|
+
if @joined[i] == ' '
|
65
|
+
chunk, offset = _find_chunk_and_offset(i)
|
50
66
|
chunk[offset] = '~'
|
51
67
|
end
|
52
68
|
end
|
data/lib/ovec/version.rb
CHANGED
data/test/lib/ovec/tier.rb
CHANGED
@@ -6,32 +6,43 @@ module Ovec
|
|
6
6
|
@tier = Tier.new
|
7
7
|
end
|
8
8
|
|
9
|
+
private
|
10
|
+
def assert_ties_to(input, output)
|
11
|
+
parser = Ovec::Parser.new(debug: true)
|
12
|
+
tree = parser.parse(input.dup)
|
13
|
+
|
14
|
+
tm = Ovec::TexManipulator.new
|
15
|
+
tm.bind(tree)
|
16
|
+
|
17
|
+
tm.run_text_manipulator(@tier)
|
18
|
+
|
19
|
+
text = tree.to_tex
|
20
|
+
|
21
|
+
assert_equal output, text
|
22
|
+
end
|
23
|
+
|
24
|
+
public
|
9
25
|
def test_basic_without_ties
|
10
26
|
text = "Ahoj. Jak se máš?"
|
11
27
|
text_duplicate = text.dup
|
12
|
-
@tier.bind([text_duplicate])
|
13
|
-
@tier.run
|
14
|
-
assert_equal text, text_duplicate
|
15
|
-
end
|
16
28
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
@tier
|
24
|
-
|
29
|
+
parser = Ovec::Parser.new(debug: true)
|
30
|
+
tree = parser.parse(text)
|
31
|
+
|
32
|
+
tm = Ovec::TexManipulator.new
|
33
|
+
tm.bind(tree)
|
34
|
+
|
35
|
+
tm.run_text_manipulator(@tier)
|
36
|
+
|
37
|
+
text = tree.to_tex
|
38
|
+
|
39
|
+
assert_equal text, text_duplicate
|
25
40
|
end
|
26
41
|
|
27
42
|
def test_simple_tie
|
28
43
|
assert_ties_to "K blabla u blabla s blabla.", "K~blabla u~blabla s~blabla."
|
29
44
|
end
|
30
45
|
|
31
|
-
def test_array_tie
|
32
|
-
assert_ties_to [ "K blabla u", " blabla ", "s blabla.", " A blabla?" ], [ "K~blabla u", "~blabla ", "s~blabla.", " A~blabla?" ]
|
33
|
-
end
|
34
|
-
|
35
46
|
def test_regex_works
|
36
47
|
regex = Tier::REGEX
|
37
48
|
assert !("ahoj" =~ regex)
|
@@ -50,7 +61,7 @@ module Ovec
|
|
50
61
|
end
|
51
62
|
|
52
63
|
def test_tie_across_newline
|
53
|
-
assert_ties_to "
|
64
|
+
assert_ties_to "Pojednavani pojednavajici\no pojednavani.", "Pojednavani pojednavajici\no~pojednavani."
|
54
65
|
end
|
55
66
|
|
56
67
|
def test_tie_a_after_pause
|
@@ -65,5 +76,16 @@ module Ovec
|
|
65
76
|
def test_tie_various
|
66
77
|
assert_ties_to "Je-li x sudé, je dělitelné dvěma (v opačném případě není).", "Je-li x sudé, je dělitelné dvěma (v~opačném případě není)."
|
67
78
|
end
|
79
|
+
|
80
|
+
def test_tie_in_newline
|
81
|
+
assert_ties_to "V\nrámci\ntohohle", "V~rámci\ntohohle"
|
82
|
+
assert_ties_to "V\nrámci tohohle", "V~rámci tohohle"
|
83
|
+
end
|
84
|
+
|
85
|
+
def test_date_regex_ok
|
86
|
+
assert "10" =~ /\A\p{Nd}*\Z/
|
87
|
+
assert "1. 3. 2013" =~ Tier::DATE_REGEX
|
88
|
+
assert "Bylo zrovna 1. 3. 2013." =~ Tier::DATE_REGEX
|
89
|
+
end
|
68
90
|
end
|
69
91
|
end
|