rbpar 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +58 -0
- data/bin/rbpar.rb +94 -0
- data/lib/rbpar_engine.rb +196 -0
- data/lib/rbpar_main.rb +68 -0
- data/lib/rbpar_paragraph.rb +344 -0
- data/test/rbpar_engine_test.rb +152 -0
- data/test/rbpar_main_test.rb +212 -0
- data/test/rbpar_paragraph_test.rb +305 -0
- data/test/rbpar_test.rb +11 -0
- metadata +54 -0
data/README
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
*** RBPAR README ***
|
2
|
+
|
3
|
+
|
4
|
+
INTRODUCTION
|
5
|
+
|
6
|
+
rbpar is a program and an accompanying library suite meant
|
7
|
+
for formatting text paragraphs. The library includes an
|
8
|
+
implementation of the optimal line breaking algorithm by
|
9
|
+
Donald E. Knuth and a semi-intelligent paragraph parser that
|
10
|
+
is not confused by email quotes. This is due to the fact
|
11
|
+
that rbpar was first designed for email writing.
|
12
|
+
|
13
|
+
The program executable 'rbpar.rb' is installed to the
|
14
|
+
executable path by default. The program reads input from
|
15
|
+
STDIN and outputs formatted paragraphs to STDOUT.
|
16
|
+
|
17
|
+
|
18
|
+
INSTALLATION
|
19
|
+
|
20
|
+
If you received the rbpar gem, you can install the gem with
|
21
|
+
"gem install rbpar-0.1.0.gem". An easier alternative is to
|
22
|
+
use rubygems to download and install it directly with "gem
|
23
|
+
install rbpar". The file rbpar.rb should be automatically
|
24
|
+
added to your path.
|
25
|
+
|
26
|
+
|
27
|
+
USAGE
|
28
|
+
|
29
|
+
rbpar.rb [-v] [-w <number>]
|
30
|
+
|
31
|
+
rbpar.rb accepts the following parameters:
|
32
|
+
|
33
|
+
-v, --vim : Use vim-style line endings (with an ending space)
|
34
|
+
-w, --width: Set the desired line width
|
35
|
+
|
36
|
+
Example usage to format a text file:
|
37
|
+
|
38
|
+
cat unprocessed.txt | rbpar.rb > processed.txt
|
39
|
+
|
40
|
+
|
41
|
+
VIM INTEGRATION
|
42
|
+
|
43
|
+
To make vim's 'gq' operator use rbpar, use the following
|
44
|
+
command either in vim command menu or in your vimrc file:
|
45
|
+
|
46
|
+
set formatprg=rbpar.rb\ -w\ 63\ -v
|
47
|
+
|
48
|
+
This sets the desired line width to be 62 characters.
|
49
|
+
One extra space is inserted after every line that does
|
50
|
+
not end a paragraph. This enables vim (with format-option
|
51
|
+
'w') to understand the paragraph lines. This is handy when
|
52
|
+
using the 'auto-format' format option. See vim help for
|
53
|
+
"formatoptions", "fo-table" and "auto-format" for details.
|
54
|
+
|
55
|
+
|
56
|
+
CONTACT
|
57
|
+
|
58
|
+
Please email comments and suggestions to <ismo@iki.fi>.
|
data/bin/rbpar.rb
ADDED
@@ -0,0 +1,94 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
# The command line utility for using the rbpar library.
|
4
|
+
#
|
5
|
+
# Copyright Ismo Puustinen 2007.
|
6
|
+
|
7
|
+
require 'rbpar_main'
|
8
|
+
require 'getoptlong'
|
9
|
+
|
10
|
+
# This is the main program for the rbpar paragraph processing library.
|
11
|
+
# The idea is to use the program as a stdin->stdout processor and pipe
|
12
|
+
# the input data to it. For example, this would work:
|
13
|
+
#
|
14
|
+
# $ cat inputfile.txt | ruby rbpar.rb -w 63 > outputfile.txt
|
15
|
+
#
|
16
|
+
# Thus, the basic usage is identical to 'fmt' and 'par'.
|
17
|
+
|
18
|
+
# Entry point of the command line utility.
#
# Parses the command line options, reads all of STDIN, feeds the lines to
# an RbParIterator and writes the formatted paragraphs to STDOUT.
#
# Options:
#   -h, --help          print a usage line and exit with status 0
#   -w, --width NUMBER  desired line width (default 68)
#   -v, --vim           append one space to every written line; one column
#                       of the width is reserved for that space
#
# Raises ArgumentError when the effective width is not positive.
def main

  # get the options from the command line; the width option needs a value,
  # so it is declared with a required argument
  parser = GetoptLong.new(
    ["-h", "--help", GetoptLong::NO_ARGUMENT],
    ["-w", "--width", GetoptLong::REQUIRED_ARGUMENT],
    ["-v", "--vim", GetoptLong::NO_ARGUMENT]
  )

  # --vim means that an extra space should be left after each line to
  # signify that the next line is also part of the same paragraph
  # NOTE(review): the space is written after every line, including empty
  # ones and the last line of a paragraph -- confirm that this is intended.

  # set default values
  width = 68
  vim = false

  # GetoptLong reports every option under the first name it was declared with
  parser.each do |opt, arg|
    case opt
    when "-h"
      puts "Usage:\n\trbpar.rb [-w width] [-v] [-h]"
      exit(0)
    when "-w"
      width = arg.to_i
    when "-v"
      vim = true
    end
  end

  # reserve room for the extra space
  width -= 1 if vim

  # check the parameters
  unless width > 0
    error_text = "Negative or zero paragraph width!"
    error_text += " (Note: using the -v option causes the reservation of one character)" if vim
    raise ArgumentError, error_text
  end

  rbpar = RbParIterator.new(width)

  # add lines to the parser object ...
  rbpar << $stdin.readlines

  # ... and read back ready paragraphs, writing each line to stdout
  line_end = vim ? " \n" : "\n"
  rbpar.each do |paragraph|
    paragraph.each do |line|
      $stdout.write(line.rstrip + line_end)
    end
  end

end
|
87
|
+
|
88
|
+
# run the program!
|
89
|
+
|
90
|
+
# this if-statement is for non-gem testing
|
91
|
+
#if __FILE__ == $PROGRAM_NAME
|
92
|
+
# The entry point is invoked unconditionally; the surrounding
# __FILE__ == $PROGRAM_NAME guard is commented out.
main
|
93
|
+
#end
|
94
|
+
|
data/lib/rbpar_engine.rb
ADDED
@@ -0,0 +1,196 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
# Implementation of the optimal line breaking algorithm.
|
4
|
+
#
|
5
|
+
# Copyright Ismo Puustinen 2007.
|
6
|
+
|
7
|
+
# Maps the index of the first word of a line (stored as a String key) to
# the index of the first word of the following line. Keys that were never
# set return END_VALUE, which marks the start of the last line.
class ResultHash < Hash

  END_VALUE = "end".freeze

  # words:: the array of words the stored indices refer to
  def initialize(words)
    super(END_VALUE)
    @words = words
  end

  # Yields the lines one by one, starting from word index 0 and following
  # the stored break indices. Each line is the words joined with single
  # spaces. Returns an enumerator when no block is given.
  def each
    return enum_for(:each) unless block_given?

    i = 0
    while (nextIndex = self[i.to_s]) != END_VALUE
      yield @words[i...nextIndex].join(" ")
      i = nextIndex
    end
    # also the last line
    yield @words[i...@words.length].join(" ")
  end
end
|
29
|
+
|
30
|
+
class DynamicBreaker

  # DynamicBreaker implements Donald Knuth's algorithm for allocating
  # words to lines in optimal way. The algorithm uses dynamic programming
  # to accomplish this in reasonable time.
  #
  # Word indices are used as String keys in the internal hashes.

  # Penalty charged for a line that holds one word wider than the line.
  # Such a line cannot be avoided, so the value only has to be large.
  OVERLONG_PENALTY = 100_000_000

  def initialize
    # "begin_end" => character length of words[begin...end]
    @charLengthCache = {}
    # word index => total penalty of the best layout starting at that
    # word; missing entries count as 0 (start of the last line)
    @cost = Hash.new(0)
  end

  # Returns the character length of words[beginIndex...endIndex] when
  # joined with single spaces. Results are cached by index pair only, so
  # the cache is valid for one word list at a time (parse clears it).
  def charLength(words, beginIndex, endIndex)
    token = "#{beginIndex}_#{endIndex}"
    @charLengthCache[token] ||= realCharLength(words, beginIndex, endIndex)
  end

  # Uncached length calculation: sum the lengths of the words, add 1 for
  # the space after each word and remove the last space.
  def realCharLength(words, beginIndex, endIndex)
    (beginIndex...endIndex).inject(0) { |sum, i| sum + words[i].length + 1 } - 1
  end

  # Returns the total penalty of putting words[beginIndex...endIndex] on
  # one line and laying out the rest optimally: the squared distance of
  # the line length from the optimum length plus the stored cost of the
  # layout that starts at endIndex.
  def calculatePenalty(words, availableLength, beginIndex, endIndex)

    # get the line length
    actualLength = charLength(words, beginIndex, endIndex)

    # diagnostics go to stderr so that they never mix with formatted output
    warn "Error: too long line" if availableLength < actualLength

    # The optimum value should be bit less than the line width
    optimumLength = availableLength
    optimumLength = (availableLength * 0.95).to_i if availableLength > 20

    (optimumLength - actualLength)**2 + @cost[endIndex.to_s]
  end

  # Finds the best place to end the line that starts at beginIndex.
  #
  # Returns [index, penalty]: the index of the first word of the next
  # line and the total penalty of the layout from beginIndex to the end
  # of the input. Only break points before the last word are considered.
  def findBreak(words, availableLength, beginIndex)
    firstCandidate = beginIndex + 1
    smallestPenalty = 0
    smallestPenaltyIndex = firstCandidate
    found = false

    (firstCandidate...words.length).each do |i|
      unless isLegal(words, availableLength, beginIndex, i)
        # the next word is no longer in the line length range
        if i == firstCandidate
          # The first word alone is too long to fit on one line. It gets
          # a line of its own, the next line begins right after it and
          # the line is charged the maximum penalty.
          smallestPenalty = OVERLONG_PENALTY + @cost[i.to_s]
        end
        break
      end

      penalty = calculatePenalty(words, availableLength, beginIndex, i)
      if !found || penalty < smallestPenalty
        found = true
        smallestPenalty = penalty
        smallestPenaltyIndex = i
      end
    end

    [smallestPenaltyIndex, smallestPenalty]
  end

  # True when words[beginIndex...endIndex] fit on one line.
  def isLegal(words, availableLength, beginIndex, endIndex)
    charLength(words, beginIndex, endIndex) <= availableLength
  end

  # Public entry point of the algorithm.
  #
  # words::           array of words (strings)
  # availableLength:: maximum line length in characters, must be positive
  #
  # Returns an array of lines, each consisting of words joined with
  # single spaces. An empty word list gives an empty array and a single
  # word is returned as is. Raises ArgumentError for nil words or a
  # non-positive length.
  def parse(words, availableLength)
    if words.nil?
      raise ArgumentError, "Incorrect parameters to parse function! Nil words!"
    elsif availableLength <= 0
      raise ArgumentError, "Incorrect parameters to parse function! available length <= 0"
    end

    return [] if words.empty?
    return words if words.length == 1

    n = words.length

    # results of an earlier call must not leak into this one
    @charLengthCache.clear
    @cost.clear

    nextBreak = ResultHash.new(words)

    # index n-1 is the last word, always on last line
    (n - 2).downto(0) do |i|
      # Everything from word i onwards fits on the last line: the cost is
      # 0 and the next break is the end state. Both come from the implicit
      # Hash default values, so nothing is stored.
      next if charLength(words, i, n) <= availableLength

      # normal case, not the last line; the penalty returned by findBreak
      # already contains the cost of the remaining lines, so it is stored
      # as such
      r, penalty = findBreak(words, availableLength, i)
      @cost[i.to_s] = penalty
      nextBreak[i.to_s] = r
    end

    # get the lines from the ResultHash as a plain array
    nextBreak.collect { |line| line }
  end
end
|
182
|
+
|
183
|
+
if __FILE__ == $PROGRAM_NAME

  # Manual smoke test: break a sample text into lines of at most 70
  # characters and print every line prefixed with its length.

  sample_text = "Sed eget ligula. Nunc fringilla. In ullamcorper turpis quis tortor. Maecenas fringilla dui aliquet leo. Nulla nec mi ut mauris ultrices sollicitudin. Mauris feugiat ornare massa. Ut vitae dolor sed urna blandit imperdiet. Cras tempus, orci sollicitudin pulvinar ultricies, sapien urna fringilla risus, eu rhoncus metus nisi a risus. Aliquam erat volutpat. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Duis iaculis lorem sit amet neque. Cras quis tellus. Ut molestie eros sit amet nibh blandit luctus. Quisque ac sem. Nulla condimentum eros rhoncus ipsum. Duis enim. Phasellus mattis posuere augue."

  line_breaker = DynamicBreaker.new
  broken_lines = line_breaker.parse(sample_text.split(" "), 70)

  broken_lines.each { |text| puts "#{text.length} #{text}" }

end
|
data/lib/rbpar_main.rb
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
# Implementation of a paragraph parsing library API for the rbpar suite.
|
4
|
+
# Use this class as the main entry point if you require high-level
|
5
|
+
# functions.
|
6
|
+
#
|
7
|
+
# Copyright Ismo Puustinen 2007.
|
8
|
+
|
9
|
+
require 'rbpar_engine'
|
10
|
+
require 'rbpar_paragraph'
|
11
|
+
|
12
|
+
class RbParIterator

  # RbParIterator provides the API for accessing the parsing and line
  # breaking system. Lines are added with << and the formatted
  # paragraphs are read back with each (or any Enumerable method).

  include Enumerable

  # width:: the desired line width, must be positive
  #
  # Raises ArgumentError when the width is zero or negative.
  def initialize(width)
    raise ArgumentError, "Negative or zero paragraph width!" unless width > 0

    @breaker = DynamicBreaker.new
    @width = width
    @readlines = []
  end

  # Appends an array of input lines to the lines collected so far.
  def <<(lines)
    @readlines += lines
  end

  # Formats one paragraph.
  #
  # lines:: the lines of the paragraph as an array of strings
  #
  # Returns the processed sub-paragraphs concatenated into one array.
  def process_paragraph(lines)

    # the paragraph is (at this point) only an array of lines
    paragraph = Paragraph.new(lines)

    # split the paragraph to sub-paragraphs according to e-mail
    # quoting etc.
    sub_paragraphs = paragraph.get_paragraphs

    # process removes the line breaks: it doesn't make sense to
    # send pure line breaks there, they become one empty line
    results = sub_paragraphs.collect do |sub_paragraph|
      if sub_paragraph[0] != "\n"
        sub_paragraph.process!(@breaker, @width)
      else
        [""]
      end
    end

    # combine the result paragraphs into one
    results.inject([]) { |total, processed| total + processed }
  end

  # Yields every paragraph of the collected input as processed by
  # process_paragraph. Paragraphs are separated by empty lines. Returns
  # an enumerator when no block is given.
  def each
    return enum_for(:each) unless block_given?

    # an empty separator makes each_line split in paragraph mode
    @readlines.join.each_line('') do |paragraph|
      yield process_paragraph(paragraph.each_line.to_a)
    end
  end

end
|
68
|
+
|