biblicit 2.0.3 → 2.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +0 -2
- data/biblicit.gemspec +1 -1
- data/parscit/bin/citeExtract.pl +9 -161
- data/parscit/bin/sectExtract.pl +0 -14
- data/parscit/lib/ParsCit/Controller.pm +0 -59
- data/parscit/lib/ParsCit/PreProcess.pm +0 -4
- data/parscit/lib/ParsCit/Tr2crfpp.pm +1 -7
- metadata +4 -24
- data/parscit/bin/sectLabel/processOmniXML.pl +0 -1427
- data/parscit/bin/sectLabel/processOmniXML_new.pl +0 -1025
- data/parscit/bin/sectLabel/processOmniXMLv2.pl +0 -1529
- data/parscit/bin/sectLabel/processOmniXMLv3.pl +0 -964
- data/parscit/bin/sectLabel/simplifyOmniXML.pl +0 -382
- data/parscit/bin/xml2train.pl +0 -193
- data/parscit/lib/Omni/Config.pm +0 -93
- data/parscit/lib/Omni/Omnicell.pm +0 -263
- data/parscit/lib/Omni/Omnicol.pm +0 -292
- data/parscit/lib/Omni/Omnidd.pm +0 -328
- data/parscit/lib/Omni/Omnidoc.pm +0 -153
- data/parscit/lib/Omni/Omniframe.pm +0 -223
- data/parscit/lib/Omni/Omniline.pm +0 -423
- data/parscit/lib/Omni/Omnipage.pm +0 -282
- data/parscit/lib/Omni/Omnipara.pm +0 -232
- data/parscit/lib/Omni/Omnirun.pm +0 -303
- data/parscit/lib/Omni/Omnitable.pm +0 -336
- data/parscit/lib/Omni/Omniword.pm +0 -162
- data/parscit/lib/Omni/Traversal.pm +0 -313
- data/parscit/lib/SectLabel/AAMatching.pm +0 -1949
|
@@ -1,313 +0,0 @@
|
|
|
1
|
-
package Omni::Traversal;
|
|
2
|
-
|
|
3
|
-
# Configuration
|
|
4
|
-
use strict;
|
|
5
|
-
|
|
6
|
-
# Local libraries
|
|
7
|
-
use Omni::Config;
|
|
8
|
-
|
|
9
|
-
# Omnilib configuration: object name
|
|
10
|
-
# Lookup table taken from Omni::Config. The traversal subs below compare its
# 'OMNIPARA', 'OMNITABLE' and 'OMNIFRAME' entries against get_name() results
# to decide how a level-3 object is walked.
my $obj_list = $Omni::Config::obj_list;
|
|
11
|
-
|
|
12
|
-
###
|
|
13
|
-
# 16 Feb 2010: Do Hoang Nhat Huy
|
|
14
|
-
# Omni lib is used to handle the Omnipage XML in a generic
|
|
15
|
-
# and 'elegant' way. The hierarchical structure of the classes
|
|
16
|
-
# reflects the XML tree as follows
|
|
17
|
-
# Omnidoc
|
|
18
|
-
# |
|
|
19
|
-
# Omnipage (obviously, one document can have many pages)
|
|
20
|
-
# |__________________________________
|
|
21
|
-
# || ||
|
|
22
|
-
# Omnicol (columns in a page) Omnidd (image?)
|
|
23
|
-
# | |
|
|
24
|
-
# | Omnidd (nested)
|
|
25
|
-
# |_________________________________|
|
|
26
|
-
# | |
|
|
27
|
-
# Omnipara (a paragraph) Omnitable (a table)
|
|
28
|
-
# | |
|
|
29
|
-
# | Omnicell (a cell)
|
|
30
|
-
# | |
|
|
31
|
-
# | Omnipara (a paragraph)
|
|
32
|
-
# |_________________________________|
|
|
33
|
-
# |
|
|
34
|
-
# Omniline (a line)
|
|
35
|
-
# |
|
|
36
|
-
# Omnirun (a run, text of the same format)
|
|
37
|
-
# |
|
|
38
|
-
# Omniword (an individual word)
|
|
39
|
-
#
|
|
40
|
-
# This module provides a generic way to traverse the whole tree
|
|
41
|
-
# to access each line of the document. Each line will have a
|
|
42
|
-
# group of identification (index) as follows
|
|
43
|
-
# page id;
|
|
44
|
-
# column id or dd id;
|
|
45
|
-
# column id or dd id;*
|
|
46
|
-
# table id;
|
|
47
|
-
# cell id;
|
|
48
|
-
# para id;
|
|
49
|
-
# line id;
|
|
50
|
-
# *: both dd and column can be nested inside each other
|
|
51
|
-
###
|
|
52
|
-
|
|
53
|
-
###
|
|
54
|
-
# Huydhn: collect lines whose addresses are selected
|
|
55
|
-
###
|
|
56
|
-
sub OmniCollector
{
    # Collect the lines located at the requested addresses.
    #
    #   $doc        : document object; get_objs_ref() yields its pages
    #   $line_addrs : array ref of address hashes with keys 'L1' (page),
    #                 'L2' (column or dd), 'L3' (paragraph, table or frame)
    #                 and 'L4' (line). The walk goes from the first address
    #                 to the last one and matches addresses one after another,
    #                 so they are expected in document order.
    #   $need_obj   : undefined or 0 -> collect line text,
    #                 anything else  -> collect the line objects themselves
    #
    # Returns a reference to the array of collected lines. Rows of a table or
    # frame are plain strings, so they are only collected in text mode.
    my ($doc, $line_addrs, $need_obj) = @_;

    my @collected = ();

    # Nothing requested, nothing to collect
    return (\@collected) if (scalar(@{ $line_addrs }) == 0);

    # Text is wanted unless the caller explicitly asks for objects
    my $want_obj = ((! defined $need_obj) || ($need_obj == 0)) ? 0 : 1;

    # Index of the next address to be matched, and how many there are
    my $next_addr = 0;
    my $total     = scalar(@{ $line_addrs });

    # All pages in the document
    my $pages = $doc->get_objs_ref();

    # The walk is bounded by the first and the last requested address
    my $first_page = $line_addrs->[ 0 ]->{ 'L1' };
    my $last_page  = $line_addrs->[ -1 ]->{ 'L1' };

    PAGE:
    for (my $page_idx = $first_page; $page_idx <= $last_page; $page_idx++)
    {
        # A bound only restricts a level while every level above it sits
        # exactly on the first / last requested address
        my $on_first = ($page_idx == $first_page);
        my $on_last  = ($page_idx == $last_page);

        # Columns or dds of this page
        my $blocks     = $pages->[ $page_idx ]->get_objs_ref();
        my $block_from = $on_first ? $line_addrs->[ 0 ]->{ 'L2' } : 0;
        my $block_to   = $on_last ? $line_addrs->[ -1 ]->{ 'L2' } : (scalar(@{ $blocks }) - 1);

        for (my $block_idx = $block_from; $block_idx <= $block_to; $block_idx++)
        {
            my $block_first = ($on_first && ($block_idx == $line_addrs->[ 0 ]->{ 'L2' }));
            my $block_last  = ($on_last && ($block_idx == $line_addrs->[ -1 ]->{ 'L2' }));

            # Paragraphs, tables or frames of this column
            my $items     = $blocks->[ $block_idx ]->get_objs_ref();
            my $item_from = $block_first ? $line_addrs->[ 0 ]->{ 'L3' } : 0;
            my $item_to   = $block_last ? $line_addrs->[ -1 ]->{ 'L3' } : (scalar(@{ $items }) - 1);

            for (my $item_idx = $item_from; $item_idx <= $item_to; $item_idx++)
            {
                my $item = $items->[ $item_idx ];
                my $kind = $item->get_name();

                if ($kind eq $obj_list->{ 'OMNIPARA' })
                {
                    my $item_first = ($block_first && ($item_idx == $line_addrs->[ 0 ]->{ 'L3' }));
                    my $item_last  = ($block_last && ($item_idx == $line_addrs->[ -1 ]->{ 'L3' }));

                    # Lines of the paragraph
                    my $lines     = $item->get_objs_ref();
                    my $line_from = $item_first ? $line_addrs->[ 0 ]->{ 'L4' } : 0;
                    my $line_to   = $item_last ? $line_addrs->[ -1 ]->{ 'L4' } : (scalar(@{ $lines }) - 1);

                    for (my $line_idx = $line_from; $line_idx <= $line_to; $line_idx++)
                    {
                        # Skip every line that is not the next requested one
                        next unless (($page_idx == $line_addrs->[ $next_addr ]{ 'L1' }) &&
                                     ($block_idx == $line_addrs->[ $next_addr ]{ 'L2' }) &&
                                     ($item_idx == $line_addrs->[ $next_addr ]{ 'L3' }) &&
                                     ($line_idx == $line_addrs->[ $next_addr ]{ 'L4' }));

                        if ($want_obj)
                        {
                            push @collected, $lines->[ $line_idx ];
                        }
                        else
                        {
                            push @collected, $lines->[ $line_idx ]->get_content();
                        }

                        # Move to the next address; stop the whole walk after the last one
                        $next_addr++;
                        last PAGE if ($next_addr == $total);
                    }
                }
                elsif (($kind eq $obj_list->{ 'OMNITABLE' }) || ($kind eq $obj_list->{ 'OMNIFRAME' }))
                {
                    # TODO: this is a trick to get it working for now. The cells
                    # are ignored; the table (or frame) is treated as a paragraph
                    # whose lines are the newline-separated rows of its content.
                    my @rows = split(/\n/, $item->get_content());

                    for (my $row_idx = 0; $row_idx < scalar(@rows); $row_idx++)
                    {
                        # Skip every row that is not the next requested one
                        next unless (($page_idx == $line_addrs->[ $next_addr ]{ 'L1' }) &&
                                     ($block_idx == $line_addrs->[ $next_addr ]{ 'L2' }) &&
                                     ($item_idx == $line_addrs->[ $next_addr ]{ 'L3' }) &&
                                     ($row_idx == $line_addrs->[ $next_addr ]{ 'L4' }));

                        # Rows exist only as text, so object mode collects nothing here
                        push @collected, $rows[ $row_idx ] unless ($want_obj);

                        # Move to the next address; stop the whole walk after the last one
                        $next_addr++;
                        last PAGE if ($next_addr == $total);
                    }
                }
            }
        }
    }

    return (\@collected);
}
|
|
189
|
-
|
|
190
|
-
###
|
|
191
|
-
# Huydhn: travel the Omnidoc at line level
|
|
192
|
-
###
|
|
193
|
-
sub OmniAirline
{
    # Walk the document down to line level and collect every non-empty line
    # together with its position.
    #
    #   $doc   : document object; get_objs_ref() yields its pages
    #   $start : starting position, or undef to begin at the first page
    #   $end   : ending position, or undef to run to the last page
    #
    # Both positions are hashes with the following members
    #   'L1' : page
    #   'L2' : column or dd or frame
    #   'L3' : table or paragraph or frame
    #   'L4' : line in paragraph or table or frame
    #
    # Returns two index-aligned array refs: the positions (hashes with the
    # members above) and the untrimmed content of each kept line.
    #
    # NOTE: the 'L4' member of $start / $end is only honoured inside
    # paragraphs; a table or frame is always scanned from its first to its
    # last row.
    my ($doc, $start, $end) = @_;

    # Positions and contents of the kept lines
    my @line_pos     = ();
    my @line_content = ();

    # All pages in the document
    my $pages = $doc->get_objs_ref();

    # From page, To page
    my $start_page = (defined $start) ? $start->{ 'L1' } : 0;
    my $end_page   = (defined $end) ? $end->{ 'L1' } : (scalar(@{ $pages }) - 1);

    # Tree traveling is 'not' fun. Seriously.
    # This is like a dungeon siege.
    for (my $x = $start_page; $x <= $end_page; $x++)
    {
        # A bound only restricts a level while every level above it sits
        # exactly on the start / end position
        my $on_start = ((defined $start) && ($x == $start->{ 'L1' }));
        my $on_end   = ((defined $end) && ($x == $end->{ 'L1' }));

        # Column or dd
        my $level_2  = $pages->[ $x ]->get_objs_ref();
        my $start_l2 = $on_start ? $start->{ 'L2' } : 0;
        my $end_l2   = $on_end ? $end->{ 'L2' } : (scalar(@{ $level_2 }) - 1);

        for (my $y = $start_l2; $y <= $end_l2; $y++)
        {
            my $on_start_l2 = ($on_start && ($y == $start->{ 'L2' }));
            my $on_end_l2   = ($on_end && ($y == $end->{ 'L2' }));

            # Table or paragraph
            my $level_3  = $level_2->[ $y ]->get_objs_ref();
            my $start_l3 = $on_start_l2 ? $start->{ 'L3' } : 0;
            my $end_l3   = $on_end_l2 ? $end->{ 'L3' } : (scalar(@{ $level_3 }) - 1);

            for (my $z = $start_l3; $z <= $end_l3; $z++)
            {
                my $name = $level_3->[ $z ]->get_name();

                # Is a paragraph
                if ($name eq $obj_list->{ 'OMNIPARA' })
                {
                    my $on_start_l3 = ($on_start_l2 && ($z == $start->{ 'L3' }));
                    my $on_end_l3   = ($on_end_l2 && ($z == $end->{ 'L3' }));

                    # Lines of the paragraph
                    my $level_4  = $level_3->[ $z ]->get_objs_ref();
                    my $start_l4 = $on_start_l3 ? $start->{ 'L4' } : 0;
                    my $end_l4   = $on_end_l3 ? $end->{ 'L4' } : (scalar(@{ $level_4 }) - 1);

                    for (my $t = $start_l4; $t <= $end_l4; $t++)
                    {
                        my $content = $level_4->[ $t ]->get_content();

                        # Emptiness is judged on a trimmed copy; the content
                        # that gets stored is left untouched
                        (my $trimmed = $content) =~ s/^\s+|\s+$//g;
                        next if ($trimmed eq "");

                        # Save the current position and the content of the current line
                        push @line_pos, { 'L1' => $x, 'L2' => $y, 'L3' => $z, 'L4' => $t };
                        push @line_content, $content;
                    }
                }
                # Is a table or frame
                elsif (($name eq $obj_list->{ 'OMNITABLE' }) || ($name eq $obj_list->{ 'OMNIFRAME' }))
                {
                    # TODO: this is a trick to get it working for now. The cells
                    # are ignored; the table (or frame) is treated as a paragraph
                    # whose lines are the newline-separated rows of its content.
                    my @level_4 = split(/\n/, $level_3->[ $z ]->get_content());

                    # Iterate over the valid indices only. The previous bound
                    # ($t <= scalar(@level_4)) read one element past the end.
                    for my $t (0 .. $#level_4)
                    {
                        # Only keep non-empty rows
                        (my $trimmed = $level_4[ $t ]) =~ s/^\s+|\s+$//g;
                        next if ($trimmed eq "");

                        # Save the current position and the content of the current row
                        push @line_pos, { 'L1' => $x, 'L2' => $y, 'L3' => $z, 'L4' => $t };
                        push @line_content, $level_4[ $t ];
                    }
                }
            }
        }
    }

    return (\@line_pos, \@line_content);
}
|
|
312
|
-
|
|
313
|
-
1;
|