ovec 0.0.5 → 0.0.6
This diff shows the changes between two publicly available versions of this package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/TODO +1 -0
- data/bin/ovec +5 -18
- data/lib/ovec/tier.rb +27 -11
- data/lib/ovec/version.rb +1 -1
- data/test/lib/ovec/tier.rb +39 -17
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 072ab08b381a445811c64db29452dd28f5b1c575
|
|
4
|
+
data.tar.gz: 7fc5d8aa41477ffcf71499df5d44cc88f35a52dd
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 22dbbd36ec18a956479d356ba949405b3e211631b3ec89f1f30d0e9435a69a7d1d4590b525b1d909dc57d88974cc80d38f6f2a6ffdf2276f5e25c403667e5aa9
|
|
7
|
+
data.tar.gz: 2e193c2a77a1daefeff432b8ab8da85f8f70807e76e1194268f0170fa72a69c9a844352466df5b3e760612fdcf1d60985a7328c2e12d5f533fb48f00c3704ffc
|
data/TODO
CHANGED
data/bin/ovec
CHANGED
|
@@ -6,13 +6,9 @@ require 'optparse'
|
|
|
6
6
|
options = {}
|
|
7
7
|
|
|
8
8
|
opts_parser = OptionParser.new do |opts|
|
|
9
|
-
opts.banner = "Usage: ovec [options]"
|
|
9
|
+
opts.banner = "Usage: ovec [options] [input file(s)]"
|
|
10
10
|
|
|
11
|
-
opts.on("-
|
|
12
|
-
options[:input] = filename
|
|
13
|
-
end
|
|
14
|
-
|
|
15
|
-
opts.on("-o", "--output FILENAME", "Output filename or - for stdout (required)") do |filename|
|
|
11
|
+
opts.on("-o", "--output FILENAME", "Output filename (STDOUT is used if omitted)") do |filename|
|
|
16
12
|
options[:output] = filename
|
|
17
13
|
end
|
|
18
14
|
|
|
@@ -31,19 +27,11 @@ opts_parser = OptionParser.new do |opts|
|
|
|
31
27
|
end
|
|
32
28
|
end
|
|
33
29
|
|
|
34
|
-
input_file = nil
|
|
35
30
|
output_file = nil
|
|
36
31
|
|
|
37
32
|
begin
|
|
38
33
|
opts_parser.parse!
|
|
39
|
-
|
|
40
|
-
$stderr.puts "You must specify both an input file and an output file."
|
|
41
|
-
$stderr.puts opts_parser.help
|
|
42
|
-
exit 1
|
|
43
|
-
end
|
|
44
|
-
|
|
45
|
-
input_file = (options[:input] == '-') ? STDIN : File.open(options[:input], "r")
|
|
46
|
-
output_file = (options[:output] == '-') ? STDOUT : File.open(options[:output], "w")
|
|
34
|
+
output_file = options[:output] ? File.open(options[:output], "w") : STDOUT
|
|
47
35
|
rescue OptionParser::ParseError => ex
|
|
48
36
|
$stderr.puts "Invalid options: #{ex.message}"
|
|
49
37
|
$stderr.puts opts_parser.help
|
|
@@ -53,7 +41,8 @@ rescue StandardError => ex
|
|
|
53
41
|
exit 1
|
|
54
42
|
end
|
|
55
43
|
|
|
56
|
-
|
|
44
|
+
# OptionParser#parse! changes ARGV, so ARGF works here.
|
|
45
|
+
content = ARGF.read
|
|
57
46
|
|
|
58
47
|
parser = Ovec::Parser.new(debug: options[:debug])
|
|
59
48
|
tree = parser.parse(content)
|
|
@@ -66,6 +55,4 @@ tier = Ovec::Tier.new
|
|
|
66
55
|
tm.run_text_manipulator(tier)
|
|
67
56
|
|
|
68
57
|
output_file.puts tree.to_tex
|
|
69
|
-
|
|
70
|
-
input_file.close
|
|
71
58
|
output_file.close
|
data/lib/ovec/tier.rb
CHANGED
|
@@ -4,20 +4,20 @@ module Ovec
|
|
|
4
4
|
class Tier < TextManipulator
|
|
5
5
|
# The last character this regex matches is changed to a tilde.
|
|
6
6
|
REGEX = /(
|
|
7
|
-
((\p{Z}
|
|
8
|
-
([\.\?\!](\p{Z}|\~)+[KSVZOUAI]\p{Z})| # KSVZOUAI na zacatku vety
|
|
9
|
-
(\A[KSVZOUAI]\p{Z})| # KSVZOUAI na zacatku textu
|
|
7
|
+
((\p{Z}|[~\n()\[\]\{\}])[KkSsVvZzOoUu](\p{Z}|\n))| # KSVZOU jako samostatne slovo
|
|
8
|
+
([\.\?\!](\p{Z}|\~)+[KSVZOUAI](\p{Z}|\n))| # KSVZOUAI na zacatku vety
|
|
9
|
+
(\A[KSVZOUAI](\p{Z}|\n))| # KSVZOUAI na zacatku textu
|
|
10
10
|
(\p{Z}(?=--(\p{Z}|\n)))| # mezera, za kterou je pomlcka
|
|
11
|
-
(,(\p{Z}|\~|\n)+a\p{Z}) # ... modulo 10, a~timto prvkem ...; TODO: plati tohle i pro "i"?
|
|
11
|
+
(,(\p{Z}|\~|\n)+a(\p{Z}|\n)) # ... modulo 10, a~timto prvkem ...; TODO: plati tohle i pro "i"?
|
|
12
12
|
)/x
|
|
13
13
|
|
|
14
14
|
# TODO: generally tie "5.~batalion", ...
|
|
15
15
|
# All changes within this regex are changed to a tilde.
|
|
16
16
|
DATE_REGEX = /(
|
|
17
|
-
(?<=\p{Z})\p{Nd}{1,2}\.\p{Z}
|
|
17
|
+
(?<=\p{Z}|\A)\p{Nd}{1,2}\.\p{Z}
|
|
18
18
|
(\p{Nd}{1,2}\.|leden|únor|březen|duben|květen|červen|červenec|srpen|září|říjen|listopad|prosinec| # TODO: plne sklonovani? nebo nejaky wildcard?
|
|
19
19
|
ledna|února|března|dubna|května|června|července|srpna|září|října|listopadu|prosince)\p{Z}
|
|
20
|
-
\p{Nd}{4}(
|
|
20
|
+
\p{Nd}{4}(?=(\p{Z}|[.,?!]|\Z)) # Datum jako "1. 5. 2013"
|
|
21
21
|
)/x
|
|
22
22
|
|
|
23
23
|
def run
|
|
@@ -33,7 +33,24 @@ module Ovec
|
|
|
33
33
|
match = matches[i]
|
|
34
34
|
change = match.end(0) - 1
|
|
35
35
|
chunk, offset = _find_chunk_and_offset(change)
|
|
36
|
+
|
|
37
|
+
former_character = chunk[offset]
|
|
38
|
+
|
|
36
39
|
chunk[offset] = '~'
|
|
40
|
+
|
|
41
|
+
if former_character == '\n'
|
|
42
|
+
# If we changed a newline to a tilde, change previous space to a
|
|
43
|
+
# newline -- move the tied word to the other line.
|
|
44
|
+
j = change - 1
|
|
45
|
+
while j >= 0
|
|
46
|
+
if @joined[j] == ' '
|
|
47
|
+
chunk, offset = _find_chunk_and_offset(j)
|
|
48
|
+
chunk[offset] = '\n'
|
|
49
|
+
break
|
|
50
|
+
end
|
|
51
|
+
j -= 1
|
|
52
|
+
end
|
|
53
|
+
end
|
|
37
54
|
end
|
|
38
55
|
|
|
39
56
|
_rejoin
|
|
@@ -42,11 +59,10 @@ module Ovec
|
|
|
42
59
|
|
|
43
60
|
# Dates can't overlap. 1 scan is enough.
|
|
44
61
|
matches = @joined.to_enum(:scan, DATE_REGEX).map { Regexp.last_match }
|
|
45
|
-
for
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
chunk, offset = _find_chunk_and_offset(j)
|
|
62
|
+
for match in matches
|
|
63
|
+
for i in (match.begin(0))...(match.end(0))
|
|
64
|
+
if @joined[i] == ' '
|
|
65
|
+
chunk, offset = _find_chunk_and_offset(i)
|
|
50
66
|
chunk[offset] = '~'
|
|
51
67
|
end
|
|
52
68
|
end
|
data/lib/ovec/version.rb
CHANGED
data/test/lib/ovec/tier.rb
CHANGED
|
@@ -6,32 +6,43 @@ module Ovec
|
|
|
6
6
|
@tier = Tier.new
|
|
7
7
|
end
|
|
8
8
|
|
|
9
|
+
private
|
|
10
|
+
def assert_ties_to(input, output)
|
|
11
|
+
parser = Ovec::Parser.new(debug: true)
|
|
12
|
+
tree = parser.parse(input.dup)
|
|
13
|
+
|
|
14
|
+
tm = Ovec::TexManipulator.new
|
|
15
|
+
tm.bind(tree)
|
|
16
|
+
|
|
17
|
+
tm.run_text_manipulator(@tier)
|
|
18
|
+
|
|
19
|
+
text = tree.to_tex
|
|
20
|
+
|
|
21
|
+
assert_equal output, text
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
public
|
|
9
25
|
def test_basic_without_ties
|
|
10
26
|
text = "Ahoj. Jak se máš?"
|
|
11
27
|
text_duplicate = text.dup
|
|
12
|
-
@tier.bind([text_duplicate])
|
|
13
|
-
@tier.run
|
|
14
|
-
assert_equal text, text_duplicate
|
|
15
|
-
end
|
|
16
28
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
@tier
|
|
24
|
-
|
|
29
|
+
parser = Ovec::Parser.new(debug: true)
|
|
30
|
+
tree = parser.parse(text)
|
|
31
|
+
|
|
32
|
+
tm = Ovec::TexManipulator.new
|
|
33
|
+
tm.bind(tree)
|
|
34
|
+
|
|
35
|
+
tm.run_text_manipulator(@tier)
|
|
36
|
+
|
|
37
|
+
text = tree.to_tex
|
|
38
|
+
|
|
39
|
+
assert_equal text, text_duplicate
|
|
25
40
|
end
|
|
26
41
|
|
|
27
42
|
def test_simple_tie
|
|
28
43
|
assert_ties_to "K blabla u blabla s blabla.", "K~blabla u~blabla s~blabla."
|
|
29
44
|
end
|
|
30
45
|
|
|
31
|
-
def test_array_tie
|
|
32
|
-
assert_ties_to [ "K blabla u", " blabla ", "s blabla.", " A blabla?" ], [ "K~blabla u", "~blabla ", "s~blabla.", " A~blabla?" ]
|
|
33
|
-
end
|
|
34
|
-
|
|
35
46
|
def test_regex_works
|
|
36
47
|
regex = Tier::REGEX
|
|
37
48
|
assert !("ahoj" =~ regex)
|
|
@@ -50,7 +61,7 @@ module Ovec
|
|
|
50
61
|
end
|
|
51
62
|
|
|
52
63
|
def test_tie_across_newline
|
|
53
|
-
assert_ties_to "
|
|
64
|
+
assert_ties_to "Pojednavani pojednavajici\no pojednavani.", "Pojednavani pojednavajici\no~pojednavani."
|
|
54
65
|
end
|
|
55
66
|
|
|
56
67
|
def test_tie_a_after_pause
|
|
@@ -65,5 +76,16 @@ module Ovec
|
|
|
65
76
|
def test_tie_various
|
|
66
77
|
assert_ties_to "Je-li x sudé, je dělitelné dvěma (v opačném případě není).", "Je-li x sudé, je dělitelné dvěma (v~opačném případě není)."
|
|
67
78
|
end
|
|
79
|
+
|
|
80
|
+
def test_tie_in_newline
|
|
81
|
+
assert_ties_to "V\nrámci\ntohohle", "V~rámci\ntohohle"
|
|
82
|
+
assert_ties_to "V\nrámci tohohle", "V~rámci tohohle"
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
def test_date_regex_ok
|
|
86
|
+
assert "10" =~ /\A\p{Nd}*\Z/
|
|
87
|
+
assert "1. 3. 2013" =~ Tier::DATE_REGEX
|
|
88
|
+
assert "Bylo zrovna 1. 3. 2013." =~ Tier::DATE_REGEX
|
|
89
|
+
end
|
|
68
90
|
end
|
|
69
91
|
end
|