pdf-extract 0.0.10 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/8630-31489-1-PB.mask.pdf +0 -0
- data/bin/pdf-extract +1 -2
- data/bin/test2.mask.pdf +0 -0
- data/bin/test3.mask.pdf +0 -0
- data/bin/test4.mask.pdf +0 -0
- data/bin/test5.mask.pdf +0 -0
- data/bin/test6.mask.pdf +0 -0
- data/bin/tmp.txt +368 -0
- data/lib/analysis/columns.rb +9 -5
- data/lib/analysis/sections.rb +50 -32
- data/lib/font_metrics.rb +11 -3
- data/lib/language.rb +9 -9
- data/lib/model/chunks.rb +8 -4
- data/lib/model/regions.rb +7 -7
- data/lib/multi_range.rb +13 -3
- data/lib/pdf-extract.rb +0 -2
- data/lib/references/references.rb +16 -15
- data/lib/references/resolve.rb +15 -15
- data/lib/references/score.rb +1 -1
- data/lib/spatial.rb +13 -13
- metadata +77 -134
- data/lib/view/png_view.rb +0 -30
Binary file
|
data/bin/pdf-extract
CHANGED
@@ -22,8 +22,7 @@ resolvers = {
|
|
22
22
|
|
23
23
|
outputs = {
|
24
24
|
:xml => proc { :stdout },
|
25
|
-
:pdf => proc { |f| File::basename(f.sub /\.[a-zA-Z0-9]+\Z/, "") + ".mask.pdf" }
|
26
|
-
:png => proc { |f| File::basename(f.sub /\.[a-zA-Z0-9]+\Z/, "") + ".mask.png" }
|
25
|
+
:pdf => proc { |f| File::basename(f.sub /\.[a-zA-Z0-9]+\Z/, "") + ".mask.pdf" }
|
27
26
|
}
|
28
27
|
|
29
28
|
commands = [
|
data/bin/test2.mask.pdf
ADDED
Binary file
|
data/bin/test3.mask.pdf
ADDED
Binary file
|
data/bin/test4.mask.pdf
ADDED
Binary file
|
data/bin/test5.mask.pdf
ADDED
Binary file
|
data/bin/test6.mask.pdf
ADDED
Binary file
|
data/bin/tmp.txt
ADDED
@@ -0,0 +1,368 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<pdf>
|
3
|
+
<section line_height="7.96" font="TRUPSF+CMR9" letter_ratio="0.06" year_ratio="0.0"
|
4
|
+
cap_ratio="0.2" name_ratio="0.172" word_count="250" lateness="0.125"
|
5
|
+
reference_score="3.94">ABSTRACT Detecting tables in document images is important since not only
|
6
|
+
do tables contain important information, but also most of the layout analysis methods fail in
|
7
|
+
the presence of tables in the document image. Existing approaches for table detection mainly
|
8
|
+
focus on detecting tables in single columns of text and do not work reliably on documents with
|
9
|
+
varying layouts. This paper presents a practical algorithm for table detection that works with a
|
10
|
+
high accuracy on documents with varying layouts (company reports, newspaper articles, magazine
|
11
|
+
pages, . . . ). An open source implementation of the algorithm is provided as part of the
|
12
|
+
Tesseract OCR engine. Evaluation of the algorithm on document images from publicly available
|
13
|
+
UNLV dataset shows competitive performance in comparison to the table detection module of a
|
14
|
+
commercial OCR system. Categories and Subject Descriptors I.7.5 [Document and Text Processing]:
|
15
|
+
Document Capture|Document Analysis Keywords page segmentation, table detection, document
|
16
|
+
analysis 1. INTRODUCTION Automatic conversion of paper documents into an editable electronic
|
17
|
+
representation relies on optical character recognition (OCR) technology. A typical OCR system
|
18
|
+
consists of three major steps. First, layout analysis is performed to locate text-lines in the
|
19
|
+
document image and to identify their reading order. Then, a character recognition engine
|
20
|
+
processes the text-line images and generates a text string by recognizing individual characters
|
21
|
+
in the text-line image. Finally, a language modeling module makes corrections in the text string
|
22
|
+
using a dictionary or a language model. ( The author gratefully acknowledges funding from Google
|
23
|
+
Inc. for supporting this work<component x="53.8" y="466.55" width="239.12" height="167.4"
|
24
|
+
page="1" page_width="595.28" page_height="841.89"></component><component x="53.8" y="419.88"
|
25
|
+
width="239.11" height="31.41" page="1" page_width="595.28"
|
26
|
+
page_height="841.89"></component><component x="53.8" y="383.67" width="219.51" height="20.95"
|
27
|
+
page="1" page_width="595.28" page_height="841.89"></component><component x="53.8" y="225.03"
|
28
|
+
width="239.12" height="143.37" page="1" page_width="595.28"
|
29
|
+
page_height="841.89"></component></section>
|
30
|
+
<section line_height="7.13" font="OFVLTP+NimbusRomNo9L-Regu" letter_ratio="0.14"
|
31
|
+
year_ratio="0.0" cap_ratio="0.15" name_ratio="0.2261904761904762" word_count="84"
|
32
|
+
lateness="0.125" reference_score="4.77">Permission to make digital or hard copies of all or part
|
33
|
+
of this work for personal or classroom use is granted without fee provided that copies are not
|
34
|
+
made or distributed for profit or commercial advantage and that copies bear this notice and the
|
35
|
+
full citation on the first page. To copy otherwise, to republish, to post on servers or to
|
36
|
+
redistribute to lists, requires prior specific permission and/or a fee. DAS '10, June 9-11,
|
37
|
+
2010, Boston, MA, USA Copyright 2010 ACM 978-1-60558-773-8/10/06 ...$10.00<component x="53.8"
|
38
|
+
y="120.67" width="239.1" height="69.89" page="1" page_width="595.28"
|
39
|
+
page_height="841.89"></component></section>
|
40
|
+
<section line_height="7.96" font="TRUPSF+CMR9" letter_ratio="0.05" year_ratio="0.0"
|
41
|
+
cap_ratio="0.08" name_ratio="0.20614035087719298" word_count="1140" lateness="0.375"
|
42
|
+
reference_score="2.03">Since layout analysis is the (rst step in such a process, all subsequent
|
43
|
+
stages rely on layout analysis to work correctly. One of the major challenges faced by layout
|
44
|
+
analysis is detecting table regions. Table detection is a hard problem since tables have a large
|
45
|
+
variation in their layouts. Existing open-source OCR systems lack the capability of table
|
46
|
+
detection and their layout analysis modules break down in the presence of table regions. A
|
47
|
+
distinction should be made at this stage between table detection and table recognition [8].
|
48
|
+
Table detection deals with the problem of (nding boundaries of tables in a page image. Table
|
49
|
+
recognition, on the other hand, focuses on analyzing a detected table by (nding its rows and
|
50
|
+
columns and tries to extract the structure of the table. Our focus in this paper is on the table
|
51
|
+
detection problem. Wang et al. [20] take a statistical learning approach for the table detection
|
52
|
+
problem. Given a set of candidate text-lines, candidate table lines are identi(ed based on gaps
|
53
|
+
between consecutive words. Then, vertically adjacent lines with large gaps and horizontally
|
54
|
+
adjacent words are grouped together to make table entity candidates. Finally, a statistical
|
55
|
+
based learning algorithm is used to re(ne the table candidates and reduce false alarms. They
|
56
|
+
make the assumption that the maximum number of columns is two and design three templates of page
|
57
|
+
layout (single column, double column, mixed column). They apply a column style classi(cation
|
58
|
+
algorithm to (nd out the column layout of the page and use this information as a priori
|
59
|
+
knowledge for spotting table regions. This approach can handle only those layouts on which it
|
60
|
+
has been trained. Besides, training the algorithm requires a large amount of labeled data. Hu et
|
61
|
+
al. [6] presented a system for table detection from scanned page images or from plain text
|
62
|
+
documents. Their system assumes a single-column input page that can be easily segmented into
|
63
|
+
individual text-lines (for instance by horizontal projection). The table detection problem is
|
64
|
+
then posed as an optimization problem where start and end textlines belonging to a table are
|
65
|
+
identi(ed by optimizing some quality function. Like previous approaches, this technique can not
|
66
|
+
be applied to multi-column documents. Cesarini et al. [2] present a system for locating table
|
67
|
+
regions by detecting parallel lines. The table hypothesis formed in this way are then veri(ed by
|
68
|
+
locating perpendicular lines or white spaces in the region included between the parallel lines.
|
69
|
+
However, relying only on horizontal or vertical lines for table detection limits the scope of
|
70
|
+
the system since not all tables have such lines. More recent work in table detection is reported
|
71
|
+
by Gatos et al. [4] and Costa e Silva [3]. Gatos et al. [4] focus on locating tables that have
|
72
|
+
both horizontal and vertical rulings and (nd their intersection points. Then, table
|
73
|
+
reconstruction is achieved by drawing the corresponding horizontal and vertical lines that
|
74
|
+
connect all line intersection pairs. The system works pretty well for their target documents but
|
75
|
+
can not be used when the tables rows/columns are not separated by ruling lines. The work of
|
76
|
+
Costa e Silva [3] focuses on extracting table regions from PDF documents using Hidden Markov
|
77
|
+
Models (HMMs). They extract text from the PDF using pdftotext Linux utility. The spaces in the
|
78
|
+
extracted text are used for computing the feature vector. Clearly, this approach would not work
|
79
|
+
for document images. Summarizing the state of the art in table detection, we can see a clear
|
80
|
+
limitation of existing methods. The methods do not work well on multi-column document images.
|
81
|
+
This is probably due to the fact that most of the existing approaches focus on table recognition
|
82
|
+
to extract the structure (rows, columns, cells) of the tables and hence make some simplifying
|
83
|
+
assumptions on the table detection part. This approach works well when one has to deal with some
|
84
|
+
speci(c classes of document images having simple layouts. However, more robust table detection
|
85
|
+
algorithms are needed when dealing with a heterogeneous collection of documents. In this paper,
|
86
|
+
we try to bridge this gap. Our goal is to accurately spot table regions in complex heterogeneous
|
87
|
+
documents (company reports, journal articles, newspapers, magazines, . . . ). Once table regions
|
88
|
+
are spotted, one of the existing table recognition techniques (e.g. [10]) could be used to
|
89
|
+
extract the structure of the tables. The rest of this paper is organized as follows. First, we
|
90
|
+
describe in Section 2 the layout analysis module of Tesseract [18, 19] that would be used as a
|
91
|
+
basis of our table detection algorithm. Then, our table detection algorithm is illustrated in
|
92
|
+
Section 3. Di(erent performance measures used to evaluate our system are presented in Section 4.
|
93
|
+
Experimental results and discussion is given in Section 5 followed by a conclusion in Section 6.
|
94
|
+
2. LAYOUT ANALYSIS VIA TAB-STOP DETECTION The layout analysis of Tesseract is a recent addition
|
95
|
+
to the open source OCR system [19]. It is based on the idea of detecting tab-stops in a document
|
96
|
+
image. When type-setting a document, tab-stops are the locations where text aligns (left, right,
|
97
|
+
center, decimal, . . . ). Therefore, tab-stops can be used as a reliable indication of where a
|
98
|
+
text block starts or ends. Finding the layout of the page via tab-stop detection proceeds as
|
99
|
+
follows (see Figure 1 for illustration): ( First, a document image pre-processing step is
|
100
|
+
performed to identify horizontal and vertical ruling lines or separators and to locate half-tone
|
101
|
+
or image regions in the document. Then, a connected component analysis is performed to identify
|
102
|
+
candidate text components based on their size and stroke width. ( The (ltered text components
|
103
|
+
are evaluated as candidates for lying on a tab-stop position. These candidates are grouped into
|
104
|
+
vertical lines to (nd tab-stop positions that are vertically aligned. As a (nal step, pairs of
|
105
|
+
connected tab lines are adjusted such that they end at the same y-coordinate (see Figure 1(a)).
|
106
|
+
At this stage, vertical tab lines marks the start and end of text regions. ( Based on the
|
107
|
+
tab-lines, the column layout of the page is inferred and connected components are grouped into
|
108
|
+
Column Partitions. A column partition is a sequence of connected components that do not cross
|
109
|
+
any tab line and are of the same type (text, image, . . . ). Text column partitions can be
|
110
|
+
regarded as initial candidates for text-lines(see Figure 1(b)). ( The last step creates ows of
|
111
|
+
column partitions such that neighboring column partitions of the same type are grouped into the
|
112
|
+
same block (Figure 1(c)). Text column partitions having di(erent font size and line spacing are
|
113
|
+
grouped into di(erent blocks. Then, the reading order of these blocks is identi(ed. The boundary
|
114
|
+
of the blocks is represented as an isothetic polygon (a polygon that has all edges parallel to
|
115
|
+
the axes). 3. TABLE SPOTTING Our table detection algorithm is built upon two components of the
|
116
|
+
layout analysis module:<component x="316.81" y="477.51" width="239.11" height="154.41" page="1"
|
117
|
+
page_width="595.28" page_height="841.89"></component><component x="316.81" y="174.15"
|
118
|
+
width="239.12" height="164.87" page="1" page_width="595.28"
|
119
|
+
page_height="841.89"></component><component x="316.81" y="132.3" width="239.11" height="28.88"
|
120
|
+
page="1" page_width="595.28" page_height="841.89"></component><component x="53.8" y="480.09"
|
121
|
+
width="239.11" height="60.27" page="2" page_width="595.28"
|
122
|
+
page_height="841.89"></component><component x="53.8" y="155.81" width="239.12" height="206.72"
|
123
|
+
page="2" page_width="595.28" page_height="841.89"></component><component x="53.8" y="124.43"
|
124
|
+
width="239.11" height="18.42" page="2" page_width="595.28"
|
125
|
+
page_height="841.89"></component><component x="316.81" y="385.95" width="239.12" height="154.41"
|
126
|
+
page="2" page_width="595.28" page_height="841.89"></component><component x="316.81" y="291.8"
|
127
|
+
width="239.12" height="81.19" page="2" page_width="595.28"
|
128
|
+
page_height="841.89"></component><component x="316.81" y="170.93" width="239.64" height="108.12"
|
129
|
+
page="2" page_width="595.28" page_height="841.89"></component><component x="330.14" y="122.84"
|
130
|
+
width="225.79" height="29.39" page="2" page_width="595.28"
|
131
|
+
page_height="841.89"></component><component x="76.21" y="476.58" width="216.69" height="28.88"
|
132
|
+
page="3" page_width="595.28" page_height="841.89"></component><component x="67.12" y="385.62"
|
133
|
+
width="225.79" height="81.69" page="3" page_width="595.28"
|
134
|
+
page_height="841.89"></component><component x="67.12" y="305.12" width="225.79" height="71.23"
|
135
|
+
page="3" page_width="595.28" page_height="841.89"></component><component x="67.12" y="214.16"
|
136
|
+
width="225.79" height="81.69" page="3" page_width="595.28"
|
137
|
+
page_height="841.89"></component><component x="53.8" y="168.72" width="239.11" height="31.41"
|
138
|
+
page="3" page_width="595.28" page_height="841.89"></component></section>
|
139
|
+
<section line_height="7.96" font="TRUPSF+CMR9" letter_ratio="0.1" year_ratio="0.0"
|
140
|
+
cap_ratio="0.16" name_ratio="0.20500894454382826" word_count="2795" lateness="1.0"
|
141
|
+
reference_score="9.47">3.1 Identifying Table Partitions The (rst step in our algorithm identi(es
|
142
|
+
text column partitions that could belong to a table region, referred to as table partitions.
|
143
|
+
Based on the observations mentioned in the previous paragraph, three types of partitions are
|
144
|
+
marked as table partitions: (1) partitions that have at lease one large gap between their
|
145
|
+
connected components, (2) partitions that consist of only one word (no signi(cant gap between
|
146
|
+
components), (3) partitions that overlap along the y-axis with other partitions within the same
|
147
|
+
column. The (rst case identi(es table partitions that result from merging cells from di(erent
|
148
|
+
columns of a table into one partition. The second case detects table partitions that consists of
|
149
|
+
a single data cell. The third case identi(es table partitions that lie in one column but were
|
150
|
+
not joined together due to the presence of a strong tab-line. This stage tries to (nd table
|
151
|
+
partition candidates quite aggressively. This has the advantage that even small evidence of the
|
152
|
+
presence of a table is not missed, since any tables that are missed at this stage will not be
|
153
|
+
recoverable at later stages. The disadvantage of the aggressive approach is that several false
|
154
|
+
alarms may originate, for instance from single word section headings, page headers and footers,
|
155
|
+
numbered equations, small parts of text words in the marginal noise, and line drawing regions. A
|
156
|
+
smoothing (lter is applied that detects isolated table partitions that have no other table
|
157
|
+
partition neighbor above or below them. These partitions are removed from the candidate table
|
158
|
+
partition list. The candidate table partitions for our example image are shown in Figure 3(a).
|
159
|
+
3.2 Detecting Page Column Split The next step is to detect split in the column layout of the
|
160
|
+
page due to the presence of a table. Such a split occurs when the cells of the table are very
|
161
|
+
well aligned. To detect this case, we divide the page into columns and (nd the ratio of table
|
162
|
+
partitions in each column. Table columns that were erroneously reported as page columns are
|
163
|
+
easily detected since they have a high ratio of table partition as compared to normal text
|
164
|
+
partitions. However, extra care needs to be taken at this stage to undo a column split (i.e. to
|
165
|
+
merge two columns) since a wrong decision would result in merging two text columns leading to a
|
166
|
+
large numbers of errors in page layout analysis itself. Therefore, we undo a page column split
|
167
|
+
only if su(cient number of text partitions spanning the two columns are present and the split in
|
168
|
+
the columns starts with table partitions. This extra care prevents merging table columns in
|
169
|
+
full-page tables when there is no owing text in the page. Since the cost of a wrong decision
|
170
|
+
here is very high in terms of layout analysis errors we chose to perform this step defensively.
|
171
|
+
3.3 Locating Table Columns The goal of this step is to group table partitions into table
|
172
|
+
columns. For this purpose, runs of vertically neighboring table partitions are assigned to a
|
173
|
+
single table column. If a column partition of type \horizontal ruling" is encountered, the run
|
174
|
+
continues. When a partition of any other type is found, the table column obtained so far is
|
175
|
+
(nalized. If a table column consists of only one table partition, it is removed as a false
|
176
|
+
alarm. The identi(ed table columns for the example image are shown in Figure 3(b). 3.4 Marking
|
177
|
+
Table Regions Table columns obtained in the previous steps give a strong hint about the presence
|
178
|
+
of a table in that region. We make a simple assumption here: within a single page column, owing
|
179
|
+
text does not share space with a table along the y-axis. This assumption holds true for most of
|
180
|
+
the layouts that we encounter in practice since if a table shares space vertically with owing
|
181
|
+
text, it is hard to see whether the text belongs to the table or not. Based on this assumption,
|
182
|
+
we horizontally expand the boundaries of table columns to the page columns that contain them.
|
183
|
+
Hence we obtain with-in column table regions for each page column. At this stage, tables that
|
184
|
+
are laid out within one column are correctly identi(ed. However, tables spanning multiple page
|
185
|
+
columns are over-segmented. Although two table regions in neighboring page columns could be
|
186
|
+
merged if their start and end positions align, this might wrongly merge di(erent tables in the
|
187
|
+
two columns. Therefore a merge is carried out only if at least one column partition of any type
|
188
|
+
(text, table, horizontal ruling) is found that overlaps with both tables. Table partitions and
|
189
|
+
horizontal ruling partitions that are not included in any table and are directly above or below
|
190
|
+
a table region with a large overlap along the x-axis are also included in the neighboring table.
|
191
|
+
The table regions thus obtained for the example image are shown in Figure 3(c). 3.5 Removing
|
192
|
+
False Alarms Although most of the false alarms originating from normal text regions are removed
|
193
|
+
in previous stages, other sources of false alarms like marginal noise [17] and (gures still
|
194
|
+
remain. Therefore the identi(ed table regions are passed through a simple validity test: a valid
|
195
|
+
table should have at least two columns. False alarms consisting of a single column are removed
|
196
|
+
by analyzing their projection on the x-axis. Projection of a valid table on the x-axis should
|
197
|
+
have at least one zero-valley larger than the global median x-height of the page. Therefore,
|
198
|
+
table candidates that do not have a zero-valley in their vertical projection are removed. 4.
|
199
|
+
PERFORMANCE MEASURES Di(erent performance measures have been reported in the literature for
|
200
|
+
evaluating table detection algorithms. These range from simple precision and recall based
|
201
|
+
measures [6, 13] to more sophisticated measures for benchmarking complete table structure
|
202
|
+
extraction algorithms [8]. In this paper, since we are only focusing on table spotting, we use
|
203
|
+
standard measures for document image segmentation focusing on the table regions. Hence in
|
204
|
+
accordance with [13, 14, 16, 20] we use several measures for quantitatively evaluating di(erent
|
205
|
+
aspects of our table spotting algorithm. Both ground-truth tables and tables detected by our
|
206
|
+
algorithm are represented by their bounding boxes. Let G repi resent the bounding box of ith
|
207
|
+
ground-truth table and D j represent the bounding box of the jth detected table in a document
|
208
|
+
image. The amount of overlap between the two is de(ned as: 2jG \ D j ij (1) A(G ; D ) =ij i j jG
|
209
|
+
j + jD j where jG \ D j represents the area of intersection of the ij two zones, and jG j; jD j
|
210
|
+
represent the individual areas of ij the ground-truth and the detected tables. The amount of
|
211
|
+
area overlap A will vary between zero and one depending on the overlap between ground-truth
|
212
|
+
table G and detected i table D . If the two tables do not overlap at all A = 0, and j ij ij if
|
213
|
+
the two tables match perfectly i.e. jG \D j = jG j = jD j, then A = 1. ( Partial Detections:
|
214
|
+
These are the number of groundtruth tables that have a one-to-one correspondence with a detected
|
215
|
+
table, however the amount of overlap is not large enough (0:1 < A < 0:9) to be classi(ed
|
216
|
+
as a correct detection (see Figure 4(a)). ( Over-Segmented Tables: These are the number of
|
217
|
+
ground-truth tables that have a major overlap (0:1 < A < 0:9) with more than one detected
|
218
|
+
tables. This indicates that di(erent parts of the ground-truth table were detected as separate
|
219
|
+
tables (see Figure 4(b)). ( Missed Tables: These are the number of groundtruth tables that do
|
220
|
+
not have a major overlap with any of the detected tables (A ( 0:1). These tables are regarded as
|
221
|
+
missed by the detection algorithm. ( False Positive Detections: These are the number of detected
|
222
|
+
tables that do not have a major overlap with any of the ground-truth tables (A ( 0:1). These
|
223
|
+
tables are regarded as false positive detections since the system mistook some non-table region
|
224
|
+
as a table (see Figure 4(d)). ( Area Precision: While the measures de(ned above help in
|
225
|
+
understanding which types of errors were made by the table detection algorithm, the goal of this
|
226
|
+
measure is to summarize the performance of the algorithm by measuring what percentage of the
|
227
|
+
detected table regions actually belong to a table region in the groundtruth image. A high
|
228
|
+
precision is achieved when the decision about the presence of a table region is made very
|
229
|
+
conservatively. ( Area Recall: This measure evaluates the percentage of the ground-truth table
|
230
|
+
regions that was marked as belonging to a table by the algorithm. The concept of precision and
|
231
|
+
recall measures are similar to their use in the information retrieval community [13]. 5.
|
232
|
+
EXPERIMENTS AND RESULTS To evaluate the performance of our table detection algorithm, we chose
|
233
|
+
the UNLV dataset [1]. The UNLV dataset contains a large variety of documents ranging from
|
234
|
+
technical reports and business letters to newspapers and magazines. The dataset was speci(cally
|
235
|
+
created to analyze the performance of leading commercial OCR systems in the UNLV annual tests of
|
236
|
+
OCR accuracy [15]. It contains more than 10,000 scanned pages at di(erent resolutions and 1000
|
237
|
+
fax documents. The scanned pages are categorized into bi-tonal and greyscale documents. The
|
238
|
+
bi-tonal documents are again grouped into di(erent scan resolutions (200, 300, and 400 dpi). For
|
239
|
+
each page, manually-keyed ground-truth text is provided, along with manually-determined zone
|
240
|
+
information. The zones are further labeled according to their contents (text, table, half-tone,
|
241
|
+
. . . ). We picked bi-tonal documents in the 300 dpi class for our experiments since this
|
242
|
+
represents the most common settings for scanning documents. Among these images, 427 pages
|
243
|
+
containing table zones were selected. These page images were further split into a training set
|
244
|
+
of 213 images and a test set of 214 images. The training images were used in the development of
|
245
|
+
the algorithm and di(erent steps of the algorithm were extensively evaluated on these images.
|
246
|
+
The test images were used in the end to evaluate the complete system. Results of our table
|
247
|
+
detection algorithm on some sample images from the UNLV dataset are shown in Figure 5. Detailed
|
248
|
+
evaluation of the algorithm and its comparison with a state-of-the-art commercial OCR system is
|
249
|
+
given in Table 1 and Figure 6.It should be noted that the ground-truth table zones provided with
|
250
|
+
the UNLV dataset also include the table caption inside the zone. Since table caption is not a
|
251
|
+
tabular structure, it is left out of the table by all OCR systems. Therefore, we edited the
|
252
|
+
ground-truth information by manually marking the table caption regions in all documents. Then
|
253
|
+
this region was excluded from the ground-truth table zones provided with the dataset. This was
|
254
|
+
achieved by shrinking the ground-truth table zones to tightly enclose all foreground pixels that
|
255
|
+
were not part of the table caption. The experimental results show that our system was able to
|
256
|
+
spot table regions with a precision of 86% on the test data. The recall was also quite high
|
257
|
+
(79%) showing a good compromise between precision and recall. The commercial OCR system, on the
|
258
|
+
other hand, had a lower recall (37%) but higher precision (96%). Figure 6: A bar chart of the
|
259
|
+
accuracy of the proposed table detection system with that of a commercial OCR on UNLV test set
|
260
|
+
(214 page containing 268 tables). Some of the errors made by our algorithm are shown in Figure
|
261
|
+
4. An analysis of the results shows that the major source of errors are full-page tables. In
|
262
|
+
these cases, the column (nding algorithm reports several columns of text. Since newspapers also
|
263
|
+
have several text columns, without using a priori knowledge about the type of documents (report,
|
264
|
+
newspaper, . . . ) it is hard to detect that the large number of columns are due to a full-page
|
265
|
+
table. One typical example is a page containing \table of contents". Such pages are marked as
|
266
|
+
table regions in the ground-truth information provided with the UNLV dataset. However, our
|
267
|
+
algorithm regards them as regular text pages hence either missing these \tables" completely or
|
268
|
+
partially detecting them. The false positive detection made by our algorithm were also analyzed.
|
269
|
+
We noticed an interesting side-e(ect of our algorithm. Since many graphics regions have text
|
270
|
+
inside them that is spaced apart, such regions were also spotted as tables. Although such cases
|
271
|
+
were reported as false alarms, in some cases it might be bene(cial to additionally spot graphics
|
272
|
+
regions as well. Other cases of false alarms originated from tabulated equations. False alarms
|
273
|
+
in pure text regions were quite rare. 6. CONCLUSION This paper presented a table detection
|
274
|
+
algorithm as part of the Tesseract open source OCR system. The presented algorithm uses
|
275
|
+
components of the layout analysis module of Tesseract to locate tables in documents having a
|
276
|
+
large variety of layouts. Experimental results on di(erent classes of documents (company
|
277
|
+
reports, journal articles, newspaper articles, magazine pages) from the UNLV dataset showed that
|
278
|
+
our table detection algorithm competes well with that of a commercial OCR system with a much
|
279
|
+
higher recall and slightly lower precision. We plan to extend this work in the direction of
|
280
|
+
table structure extraction in future. Figure 5: Some sample images from the UNLV dataset showing
|
281
|
+
the table spotting results of our algorithm. 7. REFERENCES [1]
|
282
|
+
http://www.isri.unlv.edu/ISRI/OCRtk. [2] F. Cesarini, S. Marinai, L. Sarti, and G. Soda.
|
283
|
+
Trainable table location in document images. In Proc. Int. Conf. on Pattern Recognition, pages
|
284
|
+
236{240, Quebec, Canada, Aug. 2002. [3] A. C. e Silva. Learning rich hidden markov models in
|
285
|
+
document analysis: Table location. In Proc. Int. Conf. on Document Analysis and Recognition,
|
286
|
+
pages 843{847, Barcelona, Spain, July 2009. [4] B. Gatos, D. Danatsas, I. Pratikakis, and S. J.
|
287
|
+
Perantonis. Automatic table detection in document images. In Proc. Int. Conf. on Advances in
|
288
|
+
Pattern Recognition, pages 612{621, Path, UK, Aug. 2005. [5] I. Guyon, R. M. Haralick, J. J.
|
289
|
+
Hull, and I. T. Phillips. Data sets for OCR and document image understanding research. In H.
|
290
|
+
Bunke and P. Wang, editors, Handbook of character recognition and document image analysis, pages
|
291
|
+
779{799. World Scienti(c, Singapore, 1997. [6] J. Hu, R. Kashi, D. Lopresti, and G. Wilfong.
|
292
|
+
Medium-independent table detection. In Proc. SPIE Document Recognition and Retrieval VII, pages
|
293
|
+
291{302, San Jose, CA, USA, Jan. 2000. [7] J. Hu, R. S. Kashi, D. Lopresti, and G. Wilfong.
|
294
|
+
Experiments in table recognition. In Proc. Int. Workshop on Document Layout Interpretation and
|
295
|
+
Applications, Seattle, WA, USA, Sep. 2001. [8] J. Hu, R. S. Kashi, D. Lopresti, and G. Wilfong.
|
296
|
+
Evaluating the performance of table processing algorithms. Int. Jour. on Document Analysis and
|
297
|
+
Recognition, 4(3):140{153, 2002. [9] D. Keysers, F. Shafait, and T. M. Breuel. Document image
|
298
|
+
zone classi(cation - a simple high-performance approach. In 2nd Int. Conf. on Computer Vision
|
299
|
+
Theory and Applications, pages 44{51, Barcelona, Spain, Mar. 2007. [10] T. Kieninger and A.
|
300
|
+
Dengel. A paper-to-HTML table converting system. In Proc. Document Analysis Systems, pages
|
301
|
+
356{365, Nagano, Japan, Nov. 1998. [11] T. Kieninger and A. Dengel. Table recognition and
|
302
|
+
labeling using intrinsic layout features. In Proc. Int. Conf. on Advances in Pattern
|
303
|
+
Recognition, Plymouth, UK, Nov. 1998. [12] T. Kieninger and A. Dengel. Applying the T-RECS table
|
304
|
+
recognition system to the business letter domain. In Proc. Int. Conf. on Document Analysis and
|
305
|
+
Recognition, pages 518–522, Seattle, WA, USA, Sep. 2001. [13] T. Kieninger and A. Dengel. An
|
306
|
+
approach towards benchmarking of table structure recognition results. In Proc. 8th Int. Conf. on
|
307
|
+
Document Analysis and Recognition, pages 1232–1236, Seoul, Korea, Aug. 2005. [14] S. Mandal, S.
|
308
|
+
Chowdhury, A. Das, and B. Chanda. A simple and effective table detection system from document
|
309
|
+
images. Int. Jour. on Document Analysis and Recognition, 8(2-3):172–182, 2006. [15] S. V. Rice,
|
310
|
+
F. R. Jenkins, and T. A. Nartker. The fourth annual test of OCR accuracy. Technical report,
|
311
|
+
Information Science Research Institute, University of Nevada, Las Vegas, 1995. [16] F. Shafait,
|
312
|
+
D. Keysers, and T. M. Breuel. Performance evaluation and benchmarking of six page segmentation
|
313
|
+
algorithms. IEEE Trans. on Pattern Analysis and Machine Intelligence, 30(6):941–954, 2008. [17]
|
314
|
+
F. Shafait, J. van Beusekom, D. Keysers, and T. M. Breuel. Document cleanup using page frame
|
315
|
+
detection. Int. Jour. on Document Analysis and Recognition, 11(2):81–96, 2008. [18] R. Smith. An
|
316
|
+
overview of the Tesseract OCR engine. In Proc. 9th Int. Conf. on Document Analysis and
|
317
|
+
Recognition, pages 629–633, Curitiba, Brazil, Sep. 2007. [19] R. Smith. Hybrid page layout
|
318
|
+
analysis via tab-stop detection. In Proc. Int. Conf. on Document Analysis and Recognition, pages
|
319
|
+
241–245, Barcelona, Spain, July 2009. [20] Y. Wang, R. Haralick, and I. T. Phillips. Automatic
|
320
|
+
table ground truth generation and a background-analysis-based table structure extraction method.
|
321
|
+
In Proc. Int. Conf. on Document Analysis and Recognition, pages 528–532, Seattle, WA, USA, Sep.
|
322
|
+
2001. [21] Y. Wang, I. Phillips, and R. Haralick. Document zone content classification and its
|
323
|
+
performance evaluation. Pattern Recognition, 39(1):57–73, 2006.<component x="316.81" y="122.84"
|
324
|
+
width="239.11" height="83.71" page="3" page_width="595.28"
|
325
|
+
page_height="841.89"></component><component x="53.8" y="455.66" width="239.12" height="81.19"
|
326
|
+
page="4" page_width="595.28" page_height="841.89"></component><component x="53.8" y="298.75"
|
327
|
+
width="239.12" height="143.95" page="4" page_width="595.28"
|
328
|
+
page_height="841.89"></component><component x="53.8" y="143.76" width="239.12" height="136.02"
|
329
|
+
page="4" page_width="595.28" page_height="841.89"></component><component x="53.8" y="122.84"
|
330
|
+
width="239.11" height="7.96" page="4" page_width="595.28"
|
331
|
+
page_height="841.89"></component><component x="316.81" y="466.12" width="239.11" height="70.73"
|
332
|
+
page="4" page_width="595.28" page_height="841.89"></component><component x="316.81" y="346.79"
|
333
|
+
width="239.12" height="104.63" page="4" page_width="595.28"
|
334
|
+
page_height="841.89"></component><component x="316.81" y="206.53" width="239.11" height="125.55"
|
335
|
+
page="4" page_width="595.28" page_height="841.89"></component><component x="316.81" y="122.84"
|
336
|
+
width="239.11" height="70.73" page="4" page_width="595.28"
|
337
|
+
page_height="841.89"></component><component x="53.8" y="506.1" width="239.12" height="60.27"
|
338
|
+
page="5" page_width="595.28" page_height="841.89"></component><component x="53.8" y="366.62"
|
339
|
+
width="239.11" height="125.55" page="5" page_width="595.28"
|
340
|
+
page_height="841.89"></component><component x="53.8" y="237.59" width="239.11" height="115.09"
|
341
|
+
page="5" page_width="595.28" page_height="841.89"></component><component x="53.8" y="122.42"
|
342
|
+
width="239.11" height="102.21" page="5" page_width="595.28"
|
343
|
+
page_height="841.89"></component><component x="316.81" y="495.64" width="239.11" height="71.23"
|
344
|
+
page="5" page_width="595.28" page_height="841.89"></component><component x="330.14" y="388.41"
|
345
|
+
width="226.7" height="50.31" page="5" page_width="595.28"
|
346
|
+
page_height="841.89"></component><component x="330.14" y="329.86" width="225.78" height="50.31"
|
347
|
+
page="5" page_width="595.28" page_height="841.89"></component><component x="330.14" y="191.85"
|
348
|
+
width="225.79" height="39.85" page="5" page_width="595.28"
|
349
|
+
page_height="841.89"></component><component x="330.14" y="122.84" width="225.79" height="60.77"
|
350
|
+
page="5" page_width="595.28" page_height="841.89"></component><component x="67.12" y="692.7"
|
351
|
+
width="225.78" height="92.15" page="6" page_width="595.28"
|
352
|
+
page_height="841.89"></component><component x="67.12" y="625.6" width="225.78" height="50.31"
|
353
|
+
page="6" page_width="595.28" page_height="841.89"></component><component x="53.8" y="342.52"
|
354
|
+
width="239.32" height="261.54" page="6" page_width="595.28"
|
355
|
+
page_height="841.89"></component><component x="53.8" y="122.84" width="239.11" height="206.72"
|
356
|
+
page="6" page_width="595.28" page_height="841.89"></component><component x="316.81" y="536.37"
|
357
|
+
width="239.11" height="39.34" page="6" page_width="595.28"
|
358
|
+
page_height="841.89"></component><component x="316.81" y="376.11" width="239.11" height="133.49"
|
359
|
+
page="6" page_width="595.28" page_height="841.89"></component><component x="316.81" y="271.5"
|
360
|
+
width="239.11" height="91.65" page="6" page_width="595.28"
|
361
|
+
page_height="841.89"></component><component x="316.81" y="122.84" width="239.11" height="125.56"
|
362
|
+
page="6" page_width="595.28" page_height="841.89"></component><component x="55.39" y="134.11"
|
363
|
+
width="498.95" height="7.96" page="7" page_width="595.28"
|
364
|
+
page_height="841.89"></component><component x="53.8" y="124.03" width="237.98" height="481.22"
|
365
|
+
page="8" page_width="595.28" page_height="841.89"></component><component x="316.81" y="157.4"
|
366
|
+
width="239.11" height="445.82" page="8" page_width="595.28"
|
367
|
+
page_height="841.89"></component></section>
|
368
|
+
</pdf>
|
data/lib/analysis/columns.rb
CHANGED
@@ -28,14 +28,14 @@ module PdfExtract
|
|
28
28
|
def self.include_in pdf
|
29
29
|
deps = [:regions, :bodies]
|
30
30
|
pdf.spatials :columns, :paged => true, :depends_on => deps do |parser|
|
31
|
-
|
31
|
+
|
32
32
|
body = nil
|
33
33
|
body_regions = []
|
34
34
|
|
35
35
|
parser.before do
|
36
36
|
body_regions = []
|
37
37
|
end
|
38
|
-
|
38
|
+
|
39
39
|
parser.objects :bodies do |b|
|
40
40
|
body = b
|
41
41
|
end
|
@@ -48,7 +48,7 @@ module PdfExtract
|
|
48
48
|
|
49
49
|
parser.after do
|
50
50
|
column_sample_count = pdf.settings[:column_sample_count]
|
51
|
-
|
51
|
+
|
52
52
|
step = 1.0 / (column_sample_count + 1)
|
53
53
|
column_ranges = []
|
54
54
|
|
@@ -59,10 +59,14 @@ module PdfExtract
|
|
59
59
|
|
60
60
|
# Discard those with a coverage of 0.
|
61
61
|
column_ranges.reject! { |r| r.covered.zero? }
|
62
|
-
|
62
|
+
|
63
63
|
# Discard those with more than x columns. They've probably hit a table.
|
64
64
|
column_ranges.reject! { |r| r.count > pdf.settings[:max_column_count] }
|
65
65
|
|
66
|
+
# Discard ranges that consist only of very narrow columns.
|
67
|
+
# Likely tables or columns picking up on false tab stops.
|
68
|
+
column_ranges.reject! { |r| r.widest < (0.25 * body[:width]) }
|
69
|
+
|
66
70
|
if column_ranges.count.zero?
|
67
71
|
[]
|
68
72
|
else
|
@@ -79,7 +83,7 @@ module PdfExtract
|
|
79
83
|
end
|
80
84
|
end
|
81
85
|
end
|
82
|
-
|
86
|
+
|
83
87
|
end
|
84
88
|
end
|
85
89
|
|
data/lib/analysis/sections.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
1
2
|
require_relative '../language'
|
2
3
|
require_relative '../spatial'
|
3
4
|
require_relative '../kmeans'
|
@@ -10,16 +11,19 @@ module PdfExtract
|
|
10
11
|
:module => self.name,
|
11
12
|
:description => "Minimum ratio of text region width to containing column width for a text region to be considered as part of an article section."
|
12
13
|
}
|
13
|
-
|
14
|
+
|
14
15
|
def self.match? a, b
|
15
|
-
|
16
|
-
|
17
|
-
|
16
|
+
# A must have a width around the width of B and have the same
|
17
|
+
# font size.
|
18
|
+
avg_width = (a[:width] + b[:width]) / 2.0
|
19
|
+
matched_width = (a[:width] - b[:width]).abs <= avg_width * 0.1
|
20
|
+
matched_font_size = a[:line_height].round(2) == b[:line_height].round(2)
|
21
|
+
matched_width && matched_font_size
|
18
22
|
end
|
19
23
|
|
20
24
|
def self.candidate? pdf, region, column
|
21
25
|
# Regions that make up sections or headers must be
|
22
|
-
# both less
|
26
|
+
# both less wide than their column width and,
|
23
27
|
# unless they are a single line, must be within the
|
24
28
|
# width_ratio.
|
25
29
|
width_ratio = pdf.settings[:width_ratio]
|
@@ -27,13 +31,23 @@ module PdfExtract
|
|
27
31
|
within_column && (region[:width].to_f / column[:width]) >= width_ratio
|
28
32
|
end
|
29
33
|
|
34
|
+
def self.possible_header? pdf, region, column
|
35
|
+
# Possible headers are narrower than the column width_ratio
|
36
|
+
# but still within the column bounds. They must also be at least
|
37
|
+
# as wide as they are tall (otherwise we may have a table
|
38
|
+
# column, which should be ignored for purposes of determining
|
39
|
+
# page flow).
|
40
|
+
within_column = region[:width] <= column[:width]
|
41
|
+
within_column && (region[:width] >= region[:height])
|
42
|
+
end
|
43
|
+
|
30
44
|
def self.reference_cluster clusters
|
31
45
|
# Find the cluster with name_ratio closest to 0.1
|
32
46
|
# Those are our reference sections.
|
33
47
|
ideal = 0.1
|
34
48
|
ref_cluster = nil
|
35
49
|
smallest_diff = 1
|
36
|
-
|
50
|
+
|
37
51
|
clusters.each do |cluster|
|
38
52
|
diff = (cluster[:centre][:name_ratio] - ideal).abs
|
39
53
|
if diff < smallest_diff
|
@@ -63,29 +77,29 @@ module PdfExtract
|
|
63
77
|
:letter_ratio => Language.letter_ratio(content),
|
64
78
|
:year_ratio => Language.year_ratio(content),
|
65
79
|
:cap_ratio => Language.cap_ratio(content),
|
66
|
-
:name_ratio => Language.name_ratio(content),
|
80
|
+
:name_ratio => Language.name_ratio(content),
|
67
81
|
:word_count => Language.word_count(content),
|
68
|
-
:lateness => (last_page / page_count.to_f)
|
82
|
+
:lateness => (last_page / page_count.to_f)
|
69
83
|
})
|
70
84
|
end
|
71
85
|
end
|
72
|
-
|
86
|
+
|
73
87
|
def self.include_in pdf
|
74
88
|
pdf.spatials :sections, :depends_on => [:regions, :columns] do |parser|
|
75
89
|
|
76
90
|
columns = []
|
77
|
-
|
91
|
+
|
78
92
|
parser.objects :columns do |column|
|
79
|
-
|
93
|
+
columns << {:column => column, :regions => []}
|
80
94
|
end
|
81
95
|
|
82
96
|
parser.objects :regions do |region|
|
83
97
|
containers = columns.reject do |c|
|
84
98
|
column = c[:column]
|
85
|
-
not (column[:page] == region[:page] && Spatial.contains?(column, region))
|
99
|
+
not (column[:page] == region[:page] && Spatial.contains?(column, region, 1))
|
86
100
|
end
|
87
101
|
|
88
|
-
containers.first[:regions] << region unless containers.
|
102
|
+
containers.first[:regions] << region unless containers.empty?
|
89
103
|
end
|
90
104
|
|
91
105
|
parser.after do
|
@@ -107,36 +121,40 @@ module PdfExtract
|
|
107
121
|
end
|
108
122
|
|
109
123
|
sections = []
|
110
|
-
|
111
|
-
|
124
|
+
merging_region = nil
|
125
|
+
|
112
126
|
pages.each_pair do |page, columns|
|
113
|
-
columns.each do |
|
114
|
-
column =
|
115
|
-
|
116
|
-
|
117
|
-
|
127
|
+
columns.each do |container|
|
128
|
+
column = container[:column]
|
129
|
+
|
130
|
+
container[:regions].each do |region|
|
118
131
|
if candidate? pdf, region, column
|
119
|
-
if !
|
120
|
-
content = Spatial.merge_lines(
|
121
|
-
|
132
|
+
if !merging_region.nil? && match?(merging_region, region)
|
133
|
+
content = Spatial.merge_lines(merging_region, region, {})
|
134
|
+
|
135
|
+
merging_region.merge!(content)
|
122
136
|
|
123
|
-
|
124
|
-
|
137
|
+
merging_region[:components] << Spatial.get_dimensions(region)
|
138
|
+
elsif !merging_region.nil?
|
139
|
+
sections << merging_region
|
140
|
+
merging_region = region.merge({
|
141
|
+
:components => [Spatial.get_dimensions(region)]
|
142
|
+
})
|
125
143
|
else
|
126
|
-
|
144
|
+
merging_region = region.merge({
|
127
145
|
:components => [Spatial.get_dimensions(region)]
|
128
146
|
})
|
129
147
|
end
|
130
|
-
|
131
|
-
|
132
|
-
|
148
|
+
elsif possible_header? pdf, region, column
|
149
|
+
# Split sections, ignore the header
|
150
|
+
sections << merging_region if !merging_region.nil?
|
151
|
+
merging_region = nil
|
133
152
|
end
|
134
|
-
|
135
153
|
end
|
136
154
|
end
|
137
155
|
end
|
138
156
|
|
139
|
-
sections
|
157
|
+
sections << merging_region if not merging_region.nil?
|
140
158
|
|
141
159
|
# We now have sections. Add information to them.
|
142
160
|
# add_content_types sections
|
@@ -155,7 +173,7 @@ module PdfExtract
|
|
155
173
|
|
156
174
|
sections
|
157
175
|
end
|
158
|
-
|
176
|
+
|
159
177
|
end
|
160
178
|
end
|
161
179
|
|