rbpar 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
@@ -0,0 +1,58 @@
1
+ *** RBPAR README ***
2
+
3
+
4
+ INTRODUCTION
5
+
6
+ rbpar is a program and an accompanying library suite meant
7
+ for formatting text paragraphs. The library includes an
8
+ implementation of the optimal line breaking algorithm by
9
+ Donald E. Knuth and a semi-intelligent paragraph parser that
10
+ is not confused by email quotes. This is due to the fact
11
+ that rbpar was first designed for email writing.
12
+
13
+ The program executable 'rbpar.rb' is installed to the
14
+ executable path by default. The program reads input from
15
+ STDIN and outputs formatted paragraphs to STDOUT.
16
+
17
+
18
+ INSTALLATION
19
+
20
+ If you received the rbpar gem, you can install the gem with
21
+ "gem install rbpar-0.1.0.gem". An easier alternative is to
22
+ use rubygems to download and install it directly with "gem
23
+ install rbpar". The file rbpar.rb should be automatically
24
+ added to your path.
25
+
26
+
27
+ USAGE
28
+
29
+ rbpar.rb [-v] [-w <number>]
30
+
31
+ rbpar.rb accepts the following parameters:
32
+
33
+ -v, --vim : Use vim-style line endings (with an ending space)
34
+ -w, --width: Set the desired line width
35
+
36
+ Example usage to format a text file:
37
+
38
+ cat unprocessed.txt | rbpar.rb > processed.txt
39
+
40
+
41
+ VIM INTEGRATION
42
+
43
+ To make vim's 'gq' operator use rbpar, use the following
44
+ command either on the vim command line or in your vimrc file:
45
+
46
+ set formatprg=rbpar.rb\ -w\ 63\ -v
47
+
48
+ This sets the text width to 62 characters (63 minus one for -v).
49
+ One extra space is inserted after every line that does
50
+ not end a paragraph. This enables vim (with format-option
51
+ 'w') to understand the paragraph lines. This is handy when
52
+ using the 'auto-format' format option. See vim help for
53
+ "formatoptions", "fo-table" and "auto-format" for details.
54
+
55
+
56
+ CONTACT
57
+
58
+ Please email comments and suggestions to <ismo@iki.fi>.
data/bin/rbpar.rb ADDED
@@ -0,0 +1,94 @@
1
+ #!/usr/bin/ruby
2
+
3
+ # The command line utility for using the rbpar library.
4
+ #
5
+ # Copyright Ismo Puustinen 2007.
6
+
7
+ require 'rbpar_main'
8
+ require 'getoptlong'
9
+
10
+ # This is the main program for the rbpar paragraph processing library.
11
+ # The idea is to use the program as a stdin->stdout processor and pipe
12
+ # the input data to it. For example, this would work:
13
+ #
14
+ # $ cat inputfile.txt | ruby rbpar.rb -w 63 > outputfile.txt
15
+ #
16
+ # Thus, the basic usage is identical to 'fmt' and 'par'.
17
+
18
# Entry point of the rbpar command line filter.
#
# Parses the command line, reads all of STDIN, feeds the lines to an
# RbParIterator and writes every formatted line to STDOUT.
#
# Options:
#   -h, --help   print a usage line and exit with status 0
#   -w, --width  desired line width (default 68)
#   -v, --vim    append one space to every non-empty output line; one
#                column of the width is reserved for that space
#
# Raises ArgumentError when the effective width is zero or negative.
def main
  parser = GetoptLong.new
  parser.set_options(
    ["-h", "--help", GetoptLong::NO_ARGUMENT],
    ["-w", "--width", GetoptLong::OPTIONAL_ARGUMENT],
    ["-v", "--vim", GetoptLong::NO_ARGUMENT]
  )

  # default values
  width = 68
  vim = false

  # GetoptLong reports each option under its first (canonical) name
  parser.each do |opt, arg|
    case opt
    when "-h"
      puts "Usage:\n\trbpar.rb [-w width] [-v] [-h]"
      exit(0)
    when "-w"
      width = arg.to_i
    when "-v"
      vim = true
    end
  end

  # reserve room for the extra space
  width -= 1 if vim

  unless width > 0
    error_text = "Negative or zero paragraph width!"
    error_text += " (Note: using the -v option causes the reservation of one character)" if vim
    raise ArgumentError, error_text
  end

  # add lines to the parser object ...
  rbpar = RbParIterator.new(width)
  rbpar << $stdin.readlines

  # ... and read back ready paragraphs
  rbpar.each do |paragraph|
    paragraph.each do |line|
      text = line.rstrip
      # The trailing space marks "this paragraph continues"; an empty
      # line must stay empty so that it still separates paragraphs.
      text += " " if vim && !text.empty?
      $stdout.write(text)
      $stdout.write("\n")
    end
  end
end
87
+
88
+ # run the program!
89
+
90
+ # this if-statement is for non-gem testing
91
+ #if __FILE__ == $PROGRAM_NAME
92
+ main
93
+ #end
94
+
@@ -0,0 +1,196 @@
1
+ #!/usr/bin/ruby
2
+
3
+ # Implementation of the optimal line breaking algorithm.
4
+ #
5
+ # Copyright Ismo Puustinen 2007.
6
+
7
# ResultHash records where each output line ends.
#
# Keys are word indices converted to strings; the value stored under a
# key is the index of the word that starts the following line. A key
# that was never set returns END_VALUE (the Hash default), meaning that
# the line starting at that word runs to the end of the word list.
#
# Note that #each is overridden: it yields the finished text lines, not
# key/value pairs. Enumerable methods such as #map follow this override.
class ResultHash < Hash

  END_VALUE = "end"

  # words:: the array of words the stored indices refer to
  def initialize(words)
    super(END_VALUE)
    @words = words
  end

  # Yields every line, from the first word to the last, as a string of
  # words joined by single spaces. Returns an Enumerator when called
  # without a block.
  def each
    return enum_for(:each) unless block_given?

    line_start = 0
    while (line_end = self[line_start.to_s]) != END_VALUE
      yield @words[line_start...line_end].join(" ")
      line_start = line_end
    end
    # the last line has no stored break and takes the remaining words
    yield @words[line_start...@words.length].join(" ")
  end
end
29
+
30
# DynamicBreaker implements Donald Knuth's algorithm for allocating
# words to lines in an optimal way. The algorithm uses dynamic
# programming: working backwards from the last word, it stores for
# every word the cheapest way to lay out the text that starts there.
#
# The penalty of a line is the square of the difference between its
# length and the optimum length; the last line is free. #parse is the
# public entry point; the other methods are helpers whose cached state
# is only valid during one #parse call.
class DynamicBreaker

  def initialize
    # [beginIndex, endIndex] => character length of that word range
    @length_cache = {}
    # word index => cost of the best layout starting at that word;
    # unset entries (the start of the last line) cost 0
    @cost = Hash.new(0)
  end

  # Cached version of #realCharLength. The cache is keyed by the indices
  # only, so it must be cleared whenever +words+ changes (#parse does).
  def charLength(words, beginIndex, endIndex)
    @length_cache[[beginIndex, endIndex]] ||= realCharLength(words, beginIndex, endIndex)
  end

  # Returns the length of words[beginIndex...endIndex] when joined with
  # single spaces: the sum of the word lengths plus one space after each
  # word, minus the space after the last word.
  def realCharLength(words, beginIndex, endIndex)
    (beginIndex...endIndex).inject(0) { |sum, i| sum + words[i].length + 1 } - 1
  end

  # Returns the cost of putting words[beginIndex...endIndex] on one line
  # and laying out the rest optimally: the squared distance of the line
  # length from the optimum length plus the stored cost for endIndex.
  def calculatePenalty(words, availableLength, beginIndex, endIndex)
    actual_length = charLength(words, beginIndex, endIndex)

    # Not reached from #findBreak, which checks #isLegal first. Reported
    # on stderr so that the formatted text on stdout stays clean.
    warn "Error: too long line" if availableLength < actual_length

    # The optimum value should be a bit less than the line width
    optimum_length = availableLength > 20 ? (availableLength * 0.95).to_i : availableLength

    (optimum_length - actual_length)**2 + @cost[endIndex]
  end

  # Finds the cheapest place to end the line that starts at beginIndex.
  #
  # Returns [index, cost]: the index of the word starting the next line
  # and the total cost of the layout from beginIndex onwards.
  #
  # If the word at beginIndex does not fit on a line by itself, it is
  # given a line of its own without extra penalty. Every layout has to
  # break right after such a word, so a penalty would not influence the
  # choice between layouts.
  def findBreak(words, availableLength, beginIndex)
    best_index = beginIndex + 1
    best_cost = nil

    ((beginIndex + 1)...words.length).each do |i|
      # once a word no longer fits, longer lines cannot fit either
      break unless isLegal(words, availableLength, beginIndex, i)

      cost = calculatePenalty(words, availableLength, beginIndex, i)
      if best_cost.nil? || cost < best_cost
        best_cost = cost
        best_index = i
      end
    end

    [best_index, best_cost || @cost[beginIndex + 1]]
  end

  # True if words[beginIndex...endIndex] fit on one line.
  def isLegal(words, availableLength, beginIndex, endIndex)
    charLength(words, beginIndex, endIndex) <= availableLength
  end

  # Breaks +words+ into lines of at most +availableLength+ characters.
  #
  # words::           array of strings
  # availableLength:: maximum line length, must be positive
  #
  # Returns an array of lines (words joined by single spaces); [] for an
  # empty word list and +words+ itself for a single word. Raises
  # ArgumentError if +words+ is nil or +availableLength+ is not positive.
  def parse(words, availableLength)
    if words.nil? || availableLength <= 0
      reason = words.nil? ? "Nil words!" : "available length <= 0"
      raise ArgumentError, "Incorrect parameters to parse function! (#{reason})"
    end
    return [] if words.empty?
    return words if words.length == 1

    word_count = words.length

    # results of an earlier call are meaningless for new input
    @length_cache.clear
    @cost.clear

    next_break = ResultHash.new(words)

    # index word_count - 1 is the last word, always on the last line
    (word_count - 2).downto(0) do |i|
      # If everything from word i onwards fits on one line, word i can
      # start the last line: its cost stays 0 and its break stays the
      # end marker, both through the Hash default values.
      next if charLength(words, i, word_count) <= availableLength

      break_index, cost = findBreak(words, availableLength, i)
      # findBreak already includes the cost of the remaining text
      @cost[i] = cost
      next_break[i.to_s] = break_index
    end

    # Collect through #each: Hash#to_a would return key/value pairs
    # instead of the lines.
    lines = []
    next_break.each { |line| lines << line }
    lines
  end
end
182
+
183
# Manual smoke test, run only when this file is executed directly:
# breaks a sample text into lines of at most 70 characters and prints
# every line prefixed with its length.
if $PROGRAM_NAME == __FILE__
  sample_text = "Sed eget ligula. Nunc fringilla. In ullamcorper turpis quis tortor. Maecenas fringilla dui aliquet leo. Nulla nec mi ut mauris ultrices sollicitudin. Mauris feugiat ornare massa. Ut vitae dolor sed urna blandit imperdiet. Cras tempus, orci sollicitudin pulvinar ultricies, sapien urna fringilla risus, eu rhoncus metus nisi a risus. Aliquam erat volutpat. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Duis iaculis lorem sit amet neque. Cras quis tellus. Ut molestie eros sit amet nibh blandit luctus. Quisque ac sem. Nulla condimentum eros rhoncus ipsum. Duis enim. Phasellus mattis posuere augue."

  DynamicBreaker.new.parse(sample_text.split(" "), 70).each do |line|
    puts "#{line.length} #{line}"
  end
end
data/lib/rbpar_main.rb ADDED
@@ -0,0 +1,68 @@
1
+ #!/usr/bin/ruby
2
+
3
+ # Implementation of a paragraph parsing library API for the rbpar suite.
4
+ # Use this class as the main entry point if you require high-level
5
+ # functions.
6
+ #
7
+ # Copyright Ismo Puustinen 2007.
8
+
9
+ require 'rbpar_engine'
10
+ require 'rbpar_paragraph'
11
+
12
# RbParIterator provides the API for accessing the parsing and line
# breaking system. Lines of text are added with #<<; iterating with
# #each (or any Enumerable method) yields one formatted paragraph at a
# time, each as an array of lines.
class RbParIterator

  include Enumerable

  # width:: maximum line width, must be positive
  #
  # Raises ArgumentError if +width+ is zero or negative.
  def initialize(width)
    raise ArgumentError, "Negative or zero paragraph width!" unless width > 0

    @breaker = DynamicBreaker.new
    @width = width
    @readlines = []
  end

  # Appends input to the text to be formatted. +lines+ is an array of
  # lines (as returned by IO#readlines) or a single string. Returns the
  # array of all lines added so far.
  def <<(lines)
    @readlines += Array(lines)
  end

  # Formats one paragraph, given as an array of lines, and returns the
  # resulting lines as one array.
  #
  # Paragraph is defined outside this file; presumably it splits the
  # lines into sub-paragraphs according to e-mail quoting (per the
  # original comment here) -- verify against rbpar_paragraph.
  def process_paragraph(lines)
    sub_paragraphs = Paragraph.new(lines).get_paragraphs

    # combine the processed sub-paragraphs into one array of lines
    sub_paragraphs.inject([]) do |formatted, sub_paragraph|
      # process! removes the line breaks: it doesn't make sense to send
      # pure line breaks there, they become one empty line instead
      part = if sub_paragraph[0] != "\n"
               sub_paragraph.process!(@breaker, @width)
             else
               [""]
             end
      formatted + part
    end
  end

  # Yields every formatted paragraph of the text added so far. Returns
  # an Enumerator when called without a block.
  def each
    return enum_for(:each) unless block_given?

    # An empty separator selects paragraph mode: the text is split at
    # runs of blank lines. (String#each and String#collect, used here
    # originally, no longer exist since Ruby 1.9.)
    @readlines.join.each_line('') do |paragraph|
      yield process_paragraph(paragraph.lines)
    end
  end

end
68
+