crawdad 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +56 -0
- data/ext/crawdad/Makefile +25 -0
- data/ext/crawdad/breakpoint.h +53 -0
- data/ext/crawdad/paragraph.c +275 -0
- data/ext/crawdad/paragraph.h +29 -0
- data/ext/crawdad/tokens.c +57 -0
- data/ext/crawdad/tokens.h +41 -0
- data/lib/crawdad.rb +18 -0
- data/lib/crawdad/breakpoint.rb +82 -0
- data/lib/crawdad/compatibility.rb +12 -0
- data/lib/crawdad/ffi.rb +7 -0
- data/lib/crawdad/ffi/breakpoint_node.rb +36 -0
- data/lib/crawdad/ffi/paragraph.rb +58 -0
- data/lib/crawdad/ffi/tokens.rb +71 -0
- data/lib/crawdad/native.rb +11 -0
- data/lib/crawdad/paragraph.rb +293 -0
- data/lib/crawdad/prawn_tokenizer.rb +139 -0
- data/lib/crawdad/tokens.rb +48 -0
- metadata +75 -0
@@ -0,0 +1,139 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# Crawdad: Knuth-Plass linebreaking in Ruby.
|
3
|
+
#
|
4
|
+
# Copyright February 2010, Brad Ediger. All Rights Reserved.
|
5
|
+
#
|
6
|
+
# This is free software. Please see the LICENSE and COPYING files for details.
|
7
|
+
|
8
|
+
require 'strscan'
|
9
|
+
require 'enumerator'
|
10
|
+
|
11
|
+
module Crawdad

  # Ambassador to Prawn. Turns a paragraph of text into a stream of wrappable
  # items (boxes, glue, and penalties) built with the Tokens helpers.
  #
  class PrawnTokenizer

    include Tokens

    # Sets up a tokenizer for the given document (+pdf+).
    #
    # The tokenizer only calls <tt>pdf.width_of(string)</tt>, which it uses to
    # measure every word segment, space, and hyphen.
    #
    def initialize(pdf)
      @pdf = pdf
    end

    # Tokenize the given paragraph of text into a stream of items (boxes, glue,
    # and penalties).
    #
    # +options+:
    #
    # +hyphenation+::
    #   If provided, allow the given text to be hyphenated as needed to best
    #   fit the available space. Requires the text-hyphen gem. Allowable values:
    #   an ISO 639 language code (like 'pt'), or +true+ (synonym for 'en_us').
    # +indent+::
    #   If specified, indent the first line of the paragraph by the given
    #   number of PDF points.
    #
    # Returns an Array of tokens. The stream always ends with a penalty
    # inhibiting a break, finishing glue, and a forced break; for empty or
    # whitespace-only +text+ those finishing items (preceded by the indent box,
    # if requested) are the whole result.
    #
    # Raises LoadError if +hyphenation+ is requested but the text-hyphen gem
    # cannot be loaded.
    #
    def paragraph(text, options={})
      hyphenator = hyphenator_for(options[:hyphenation])

      stream = []

      # The indent is represented as an empty box of the requested width.
      indent = options[:indent]
      stream << box(indent, "") if indent

      # Interword glue can stretch by half and shrink by a third.
      # TODO: optimal stretch/shrink ratios
      space_width = @pdf.width_of(" ")
      interword_glue = glue(space_width,
                            space_width / 2.0,
                            space_width / 3.0)

      # TODO: optimal values for sentence space w/y/z
      sentence_space_width = space_width * 1.5
      sentence_glue = glue(sentence_space_width,
                           sentence_space_width / 2.0,
                           sentence_space_width / 3.0)

      # Break paragraph on whitespace.
      # TODO: how should "battle-\nfield" be tokenized?
      text.strip.split(/\s+/).each do |word|
        scanner = StringScanner.new(word)

        # For hyphenated words, follow each hyphen by a zero-width flagged
        # penalty.
        # TODO: recognize dashes in all their variants
        while seg = scanner.scan(/[^-]+-/) # "night-time" --> "<<night->>time"
          stream.concat(add_word_segment(seg, hyphenator))
        end

        # Whatever follows the last explicit hyphen (the whole word when it
        # contains none). This is emitted even when empty, exactly as before.
        remainder = scanner.rest
        stream.concat(add_word_segment(remainder, hyphenator))

        # TODO: add ties (~) or some other way to denote a period that
        # doesn't end a sentence.
        if remainder =~ /\.$/
          stream << sentence_glue
        else
          stream << interword_glue
        end
      end

      # Remove extra glue at the end. The stream is empty when +text+ holds no
      # words and no indent was requested; there is nothing to inspect then,
      # and asking for the type of a nil token would raise.
      stream.pop if !stream.empty? && token_type(stream.last) == :glue

      # Finish paragraph with a penalty inhibiting a break, finishing glue (to
      # pad out the last line), and a forced break to finish the paragraph.
      stream << penalty(Infinity)
      stream << glue(0, Infinity, 0)
      stream << penalty(-Infinity)

      stream
    end

    protected

    # Returns the hyphenator to use for the given +:hyphenation+ option value,
    # or nil when the option is unset. Hyphenators are created on first use and
    # cached per language for the lifetime of this tokenizer.
    #
    # Raises LoadError if the text-hyphen gem cannot be loaded.
    #
    def hyphenator_for(setting)
      return nil unless setting

      begin
        gem 'text-hyphen'
        require 'text/hyphen'
      rescue LoadError
        raise LoadError, ":hyphenation option requires the text-hyphen gem"
      end

      language = (setting == true) ? 'en_us' : setting
      @hyphenators ||= {}
      @hyphenators[language] ||= Text::Hyphen.new(:language => language)
    end

    # Returns a series of tokens representing the given word. Hyphenates using
    # the given +hyphenator+, if provided. Appends a zero-width flagged penalty
    # if the given word ends in a hyphen.
    #
    # +hyphenator+ may be nil; otherwise the positions returned by its
    # +hyphenate(word)+ are used as split offsets into +word+.
    #
    def add_word_segment(word, hyphenator)
      tokens = []

      if hyphenator
        # Measured once; every discretionary hyphen has the same width.
        hyphen_width = @pdf.width_of('-')

        splits = hyphenator.hyphenate(word)
        # For each hyphenated segment, add the box followed by a flagged
        # penalty as wide as the hyphen that a break there would insert.
        [0, *splits].each_cons(2) do |a, b|
          seg = word[a...b]
          tokens << box(@pdf.width_of(seg), seg)
          tokens << penalty(50, hyphen_width, true)
        end

        # The part after the final split point (the entire word when there
        # are no split points).
        last = word[(splits.last || 0)..-1]
        tokens << box(@pdf.width_of(last), last)
      else
        tokens << box(@pdf.width_of(word), word)
      end

      # An explicit trailing hyphen is already part of the box, so breaking
      # after it adds no width.
      tokens << penalty(50, 0, true) if word =~ /-$/
      tokens
    end

  end

end
|
139
|
+
|
@@ -0,0 +1,48 @@
|
|
1
|
+
|
2
|
+
module Crawdad

  # Constructors and accessors for the items of a token stream.
  #
  # Every token is a plain Array laid out as follows:
  #
  #   [:box,     width, content]
  #   [:glue,    width, stretch, shrink]
  #   [:penalty, width, penalty, flagged]
  #
  # Index 0 is always the type and index 1 is always the width, so
  # +token_type+ and +token_width+ work on any token.
  #
  module Tokens

    # -- Constructors --------------------------------------------------------

    # Builds a box token of the given +width+ holding +content+.
    def box(width, content)
      [:box, width, content]
    end

    # Builds a glue token of the given +width+ with the given +stretch+ and
    # +shrink+ values.
    def glue(width, stretch, shrink)
      [:glue, width, stretch, shrink]
    end

    # Builds a penalty token. +width+ defaults to 0 and +flagged+ to false.
    # Note the stored order is width first, then the penalty value.
    def penalty(penalty, width=0, flagged=false)
      [:penalty, width, penalty, flagged]
    end

    # -- Accessors common to all tokens --------------------------------------

    # Returns the type symbol stored at the front of +token+.
    def token_type(token)
      token[0]
    end

    # Returns the width of +token+.
    def token_width(token)
      token[1]
    end

    # -- Type-specific accessors ---------------------------------------------

    # Returns the content of a box token.
    def box_content(token)
      token[2]
    end

    # Returns the stretch value of a glue token.
    def glue_stretch(token)
      token[2]
    end

    # Returns the shrink value of a glue token.
    def glue_shrink(token)
      token[3]
    end

    # Returns the penalty value of a penalty token.
    def penalty_penalty(token)
      token[2]
    end

    # Returns the flag stored in a penalty token.
    def penalty_flagged?(token)
      token[3]
    end

  end

end
|
metadata
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: crawdad
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Brad Ediger
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-03-19 00:00:00 -05:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: Crawdad is an implementation of Knuth-Plass linebreaking (justification) for Ruby.
|
17
|
+
email: brad.ediger@madriska.com
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions:
|
21
|
+
- Rakefile
|
22
|
+
extra_rdoc_files: []
|
23
|
+
|
24
|
+
files:
|
25
|
+
- lib/crawdad
|
26
|
+
- lib/crawdad/prawn_tokenizer.rb
|
27
|
+
- lib/crawdad/breakpoint.rb
|
28
|
+
- lib/crawdad/native.rb
|
29
|
+
- lib/crawdad/ffi.rb
|
30
|
+
- lib/crawdad/paragraph.rb
|
31
|
+
- lib/crawdad/compatibility.rb
|
32
|
+
- lib/crawdad/tokens.rb
|
33
|
+
- lib/crawdad/ffi
|
34
|
+
- lib/crawdad/ffi/paragraph.rb
|
35
|
+
- lib/crawdad/ffi/tokens.rb
|
36
|
+
- lib/crawdad/ffi/breakpoint_node.rb
|
37
|
+
- lib/crawdad.rb
|
38
|
+
- ext/crawdad/breakpoint.h
|
39
|
+
- ext/crawdad/tokens.c
|
40
|
+
- ext/crawdad/paragraph.h
|
41
|
+
- ext/crawdad/Makefile
|
42
|
+
- ext/crawdad/tokens.h
|
43
|
+
- ext/crawdad/paragraph.c
|
44
|
+
- Rakefile
|
45
|
+
has_rdoc: true
|
46
|
+
homepage: http://github.com/madriska/crawdad
|
47
|
+
post_install_message:
|
48
|
+
rdoc_options:
|
49
|
+
- --title
|
50
|
+
- Crawdad Documentation
|
51
|
+
- -q
|
52
|
+
require_paths:
|
53
|
+
- lib
|
54
|
+
- ext
|
55
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
56
|
+
requirements:
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: "0"
|
60
|
+
version:
|
61
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: "0"
|
66
|
+
version:
|
67
|
+
requirements: []
|
68
|
+
|
69
|
+
rubyforge_project:
|
70
|
+
rubygems_version: 1.3.1
|
71
|
+
signing_key:
|
72
|
+
specification_version: 2
|
73
|
+
summary: Knuth-Plass linebreaking for Ruby
|
74
|
+
test_files: []
|
75
|
+
|