biblicit 2.0.3 → 2.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +0 -2
- data/biblicit.gemspec +1 -1
- data/parscit/bin/citeExtract.pl +9 -161
- data/parscit/bin/sectExtract.pl +0 -14
- data/parscit/lib/ParsCit/Controller.pm +0 -59
- data/parscit/lib/ParsCit/PreProcess.pm +0 -4
- data/parscit/lib/ParsCit/Tr2crfpp.pm +1 -7
- metadata +4 -24
- data/parscit/bin/sectLabel/processOmniXML.pl +0 -1427
- data/parscit/bin/sectLabel/processOmniXML_new.pl +0 -1025
- data/parscit/bin/sectLabel/processOmniXMLv2.pl +0 -1529
- data/parscit/bin/sectLabel/processOmniXMLv3.pl +0 -964
- data/parscit/bin/sectLabel/simplifyOmniXML.pl +0 -382
- data/parscit/bin/xml2train.pl +0 -193
- data/parscit/lib/Omni/Config.pm +0 -93
- data/parscit/lib/Omni/Omnicell.pm +0 -263
- data/parscit/lib/Omni/Omnicol.pm +0 -292
- data/parscit/lib/Omni/Omnidd.pm +0 -328
- data/parscit/lib/Omni/Omnidoc.pm +0 -153
- data/parscit/lib/Omni/Omniframe.pm +0 -223
- data/parscit/lib/Omni/Omniline.pm +0 -423
- data/parscit/lib/Omni/Omnipage.pm +0 -282
- data/parscit/lib/Omni/Omnipara.pm +0 -232
- data/parscit/lib/Omni/Omnirun.pm +0 -303
- data/parscit/lib/Omni/Omnitable.pm +0 -336
- data/parscit/lib/Omni/Omniword.pm +0 -162
- data/parscit/lib/Omni/Traversal.pm +0 -313
- data/parscit/lib/SectLabel/AAMatching.pm +0 -1949
data/parscit/lib/Omni/Omnidd.pm
DELETED
|
@@ -1,328 +0,0 @@
|
|
|
1
|
-
package Omni::Omnidd;
|
|
2
|
-
|
|
3
|
-
# Configuration
|
|
4
|
-
use strict;
|
|
5
|
-
|
|
6
|
-
# Local libraries
|
|
7
|
-
use Omni::Config;
|
|
8
|
-
use Omni::Omnicol;
|
|
9
|
-
use Omni::Omnipara;
|
|
10
|
-
use Omni::Omniframe;
|
|
11
|
-
use Omni::Omnitable;
|
|
12
|
-
|
|
13
|
-
# Extern libraries
|
|
14
|
-
use XML::Twig;
|
|
15
|
-
use XML::Parser;
|
|
16
|
-
|
|
17
|
-
# Global variables
|
|
18
|
-
my $tag_list = $Omni::Config::tag_list;
|
|
19
|
-
my $att_list = $Omni::Config::att_list;
|
|
20
|
-
my $obj_list = $Omni::Config::obj_list;
|
|
21
|
-
|
|
22
|
-
# Temporary variables
|
|
23
|
-
|
|
24
|
-
###
|
|
25
|
-
# A dd object in Omnipage xml: a dd, don't know what it is, but its structure
|
|
26
|
-
# is quite similar to a column
|
|
27
|
-
#
|
|
28
|
-
# Do Hoang Nhat Huy, 11 Jan 2011
|
|
29
|
-
###
|
|
30
|
-
# Initialization
|
|
31
|
-
sub new
|
|
32
|
-
{
|
|
33
|
-
my ($class) = @_;
|
|
34
|
-
|
|
35
|
-
# dd: a dd can have many tables, or pictures, may be paras, and columns
|
|
36
|
-
my @objs = ();
|
|
37
|
-
|
|
38
|
-
# Class members
|
|
39
|
-
my $self = { '_self' => $obj_list->{ 'OMNIDD' },
|
|
40
|
-
'_raw' => undef,
|
|
41
|
-
'_content' => undef,
|
|
42
|
-
'_bottom' => undef,
|
|
43
|
-
'_top' => undef,
|
|
44
|
-
'_left' => undef,
|
|
45
|
-
'_right' => undef,
|
|
46
|
-
'_bottom_dist' => undef,
|
|
47
|
-
'_top_dist' => undef,
|
|
48
|
-
'_left_dist' => undef,
|
|
49
|
-
'_right_dist' => undef,
|
|
50
|
-
'_objs' => \@objs };
|
|
51
|
-
|
|
52
|
-
bless $self, $class;
|
|
53
|
-
return $self;
|
|
54
|
-
}
|
|
55
|
-
|
|
56
|
-
#
|
|
57
|
-
sub set_raw
|
|
58
|
-
{
|
|
59
|
-
my ($self, $raw) = @_;
|
|
60
|
-
|
|
61
|
-
# Save the raw xml <dd> ... </dd>
|
|
62
|
-
$self->{ '_raw' } = $raw;
|
|
63
|
-
|
|
64
|
-
# Parse the raw string
|
|
65
|
-
my $twig_roots = { $tag_list->{ 'DD' } => 1 };
|
|
66
|
-
my $twig_handlers = { $tag_list->{ 'DD' } => sub { parse(@_, \$self); } };
|
|
67
|
-
|
|
68
|
-
# XML::Twig
|
|
69
|
-
my $twig = new XML::Twig( twig_roots => $twig_roots,
|
|
70
|
-
twig_handlers => $twig_handlers,
|
|
71
|
-
pretty_print => 'indented' );
|
|
72
|
-
|
|
73
|
-
# Start the XML parsing
|
|
74
|
-
$twig->parse($raw, \$self);
|
|
75
|
-
$twig->purge;
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
sub get_raw
|
|
79
|
-
{
|
|
80
|
-
my ($self) = @_;
|
|
81
|
-
return $self->{ '_raw' };
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
sub parse
|
|
85
|
-
{
|
|
86
|
-
my ($twig, $node, $self) = @_;
|
|
87
|
-
|
|
88
|
-
# At first, content is blank
|
|
89
|
-
my $tmp_content = "";
|
|
90
|
-
# because there's no object
|
|
91
|
-
my @tmp_objs = ();
|
|
92
|
-
|
|
93
|
-
# Get <dd> node attributes
|
|
94
|
-
my $tmp_bottom = GetNodeAttr($node, $att_list->{ 'BOTTOM' });
|
|
95
|
-
my $tmp_top = GetNodeAttr($node, $att_list->{ 'TOP' });
|
|
96
|
-
my $tmp_left = GetNodeAttr($node, $att_list->{ 'LEFT' });
|
|
97
|
-
my $tmp_right = GetNodeAttr($node, $att_list->{ 'RIGHT' });
|
|
98
|
-
my $tmp_bottom_dist = GetNodeAttr($node, $att_list->{ 'BOTTOMDIST' });
|
|
99
|
-
my $tmp_top_dist = GetNodeAttr($node, $att_list->{ 'TOPDIST' });
|
|
100
|
-
my $tmp_left_dist = GetNodeAttr($node, $att_list->{ 'LEFTDIST' });
|
|
101
|
-
my $tmp_right_dist = GetNodeAttr($node, $att_list->{ 'RIGHTDIST' });
|
|
102
|
-
|
|
103
|
-
# Check if there's any paragraph, col, table, or picture
|
|
104
|
-
# The large number of possible children is due to the
|
|
105
|
-
# ambiguous structure of the Omnipage XML
|
|
106
|
-
my $dd_tag = $tag_list->{ 'DD' };
|
|
107
|
-
my $img_tag = $tag_list->{ 'PICTURE' };
|
|
108
|
-
my $para_tag = $tag_list->{ 'PARA' };
|
|
109
|
-
my $table_tag = $tag_list->{ 'TABLE' };
|
|
110
|
-
my $column_tag = $tag_list->{ 'COLUMN' };
|
|
111
|
-
my $frame_tag = $tag_list->{ 'FRAME' };
|
|
112
|
-
|
|
113
|
-
my $child = undef;
|
|
114
|
-
# Get the first child in the body text
|
|
115
|
-
$child = $node->first_child();
|
|
116
|
-
|
|
117
|
-
while (defined $child)
|
|
118
|
-
{
|
|
119
|
-
my $xpath = $child->path();
|
|
120
|
-
|
|
121
|
-
# if this child is a <para> tag
|
|
122
|
-
if ($xpath =~ m/\/$para_tag$/)
|
|
123
|
-
{
|
|
124
|
-
my $para = new Omni::Omnipara();
|
|
125
|
-
|
|
126
|
-
# Set raw content
|
|
127
|
-
$para->set_raw($child->sprint());
|
|
128
|
-
|
|
129
|
-
# Update paragraph list
|
|
130
|
-
push @tmp_objs, $para;
|
|
131
|
-
|
|
132
|
-
# Update content
|
|
133
|
-
$tmp_content = $tmp_content . $para->get_content() . "\n";
|
|
134
|
-
}
|
|
135
|
-
# if this child is a <dd> tag
|
|
136
|
-
elsif ($xpath =~ m/\/$dd_tag$/)
|
|
137
|
-
{
|
|
138
|
-
my $dd = new Omni::Omnidd();
|
|
139
|
-
|
|
140
|
-
# Set raw content
|
|
141
|
-
$dd->set_raw($child->sprint());
|
|
142
|
-
|
|
143
|
-
# Nested <dd> is not allowed so we copy the objects
|
|
144
|
-
my $objects = $dd->get_objs_ref();
|
|
145
|
-
|
|
146
|
-
# Update <dd> objects list
|
|
147
|
-
push @tmp_objs, @{ $objects };
|
|
148
|
-
|
|
149
|
-
# Update content
|
|
150
|
-
$tmp_content = $tmp_content . $dd->get_content() . "\n";
|
|
151
|
-
}
|
|
152
|
-
# if this child is a <table> tag
|
|
153
|
-
elsif ($xpath =~ m/\/$table_tag$/)
|
|
154
|
-
{
|
|
155
|
-
my $table = new Omni::Omnitable();
|
|
156
|
-
|
|
157
|
-
# Set raw content
|
|
158
|
-
$table->set_raw($child->sprint());
|
|
159
|
-
|
|
160
|
-
# Update object list
|
|
161
|
-
push @tmp_objs, $table;
|
|
162
|
-
|
|
163
|
-
# Update content
|
|
164
|
-
$tmp_content = $tmp_content . $table->get_content() . "\n";
|
|
165
|
-
}
|
|
166
|
-
# if this child is a <picture> tag
|
|
167
|
-
elsif ($xpath =~ m/\/$img_tag$/)
|
|
168
|
-
{
|
|
169
|
-
#my $img = new Omni::Omniimg();
|
|
170
|
-
|
|
171
|
-
# Set raw content
|
|
172
|
-
#$img->set_raw($child->sprint());
|
|
173
|
-
|
|
174
|
-
# Update paragraph list
|
|
175
|
-
#push @tmp_objs, $img;
|
|
176
|
-
|
|
177
|
-
# Update content
|
|
178
|
-
#$tmp_content = $tmp_content . $img->get_content() . "\n";
|
|
179
|
-
}
|
|
180
|
-
# if this child is a <column> tag
|
|
181
|
-
elsif ($xpath =~ m/\/$column_tag$/)
|
|
182
|
-
{
|
|
183
|
-
#my $col = new Omni::Omnicol();
|
|
184
|
-
|
|
185
|
-
# Set raw content
|
|
186
|
-
#$col->set_raw($child->sprint());
|
|
187
|
-
|
|
188
|
-
# Update paragraph list
|
|
189
|
-
#push @tmp_objs, $col;
|
|
190
|
-
|
|
191
|
-
# Update content
|
|
192
|
-
#$tmp_content = $tmp_content . $col->get_content() . "\n";
|
|
193
|
-
}
|
|
194
|
-
# if this child is <frame>
|
|
195
|
-
elsif ($xpath =~ m/\/$frame_tag$/)
|
|
196
|
-
{
|
|
197
|
-
my $frame = new Omni::Omniframe();
|
|
198
|
-
|
|
199
|
-
# Set raw content
|
|
200
|
-
$frame->set_raw($child->sprint());
|
|
201
|
-
|
|
202
|
-
# Update object list
|
|
203
|
-
push @tmp_objs, $frame;
|
|
204
|
-
|
|
205
|
-
# Update content
|
|
206
|
-
$tmp_content = $tmp_content . $frame->get_content() . "\n";
|
|
207
|
-
}
|
|
208
|
-
|
|
209
|
-
# Little brother
|
|
210
|
-
if ($child->is_last_child)
|
|
211
|
-
{
|
|
212
|
-
last;
|
|
213
|
-
}
|
|
214
|
-
else
|
|
215
|
-
{
|
|
216
|
-
$child = $child->next_sibling();
|
|
217
|
-
}
|
|
218
|
-
}
|
|
219
|
-
|
|
220
|
-
# Copy information from temporary variables to class members
|
|
221
|
-
$$self->{ '_bottom' } = $tmp_bottom;
|
|
222
|
-
$$self->{ '_top' } = $tmp_top;
|
|
223
|
-
$$self->{ '_left' } = $tmp_left;
|
|
224
|
-
$$self->{ '_right' } = $tmp_right;
|
|
225
|
-
$$self->{ '_bottom_dist' } = $tmp_bottom_dist;
|
|
226
|
-
$$self->{ '_top_dist' } = $tmp_top_dist;
|
|
227
|
-
$$self->{ '_left_dist' } = $tmp_left_dist;
|
|
228
|
-
$$self->{ '_right_dist' } = $tmp_right_dist;
|
|
229
|
-
|
|
230
|
-
# Copy content
|
|
231
|
-
$$self->{ '_content' } = $tmp_content;
|
|
232
|
-
|
|
233
|
-
# Copy all objects
|
|
234
|
-
@{$$self->{ '_objs' } } = @tmp_objs;
|
|
235
|
-
}
|
|
236
|
-
|
|
237
|
-
sub get_name
|
|
238
|
-
{
|
|
239
|
-
my ($self) = @_;
|
|
240
|
-
return $self->{ '_self' };
|
|
241
|
-
}
|
|
242
|
-
|
|
243
|
-
sub get_objs_ref
|
|
244
|
-
{
|
|
245
|
-
my ($self) = @_;
|
|
246
|
-
return $self->{ '_objs' };
|
|
247
|
-
}
|
|
248
|
-
|
|
249
|
-
sub get_content
|
|
250
|
-
{
|
|
251
|
-
my ($self) = @_;
|
|
252
|
-
return $self->{ '_content' };
|
|
253
|
-
}
|
|
254
|
-
|
|
255
|
-
sub get_bottom_pos
|
|
256
|
-
{
|
|
257
|
-
my ($self) = @_;
|
|
258
|
-
return $self->{ '_bottom' };
|
|
259
|
-
}
|
|
260
|
-
|
|
261
|
-
sub get_top_pos
|
|
262
|
-
{
|
|
263
|
-
my ($self) = @_;
|
|
264
|
-
return $self->{ '_top' };
|
|
265
|
-
}
|
|
266
|
-
|
|
267
|
-
sub get_left_pos
|
|
268
|
-
{
|
|
269
|
-
my ($self) = @_;
|
|
270
|
-
return $self->{ '_left' };
|
|
271
|
-
}
|
|
272
|
-
|
|
273
|
-
sub get_right_pos
|
|
274
|
-
{
|
|
275
|
-
my ($self) = @_;
|
|
276
|
-
return $self->{ '_right' };
|
|
277
|
-
}
|
|
278
|
-
|
|
279
|
-
sub get_bottom_distance
|
|
280
|
-
{
|
|
281
|
-
my ($self) = @_;
|
|
282
|
-
return $self->{ '_bottom_dist' };
|
|
283
|
-
}
|
|
284
|
-
|
|
285
|
-
sub get_top_distance
|
|
286
|
-
{
|
|
287
|
-
my ($self) = @_;
|
|
288
|
-
return $self->{ '_top_dist' };
|
|
289
|
-
}
|
|
290
|
-
|
|
291
|
-
sub get_left_distance
|
|
292
|
-
{
|
|
293
|
-
my ($self) = @_;
|
|
294
|
-
return $self->{ '_left_dist' };
|
|
295
|
-
}
|
|
296
|
-
|
|
297
|
-
sub get_right_distance
|
|
298
|
-
{
|
|
299
|
-
my ($self) = @_;
|
|
300
|
-
return $self->{ '_right_dist' };
|
|
301
|
-
}
|
|
302
|
-
|
|
303
|
-
# Support functions
|
|
304
|
-
sub GetNodeAttr
|
|
305
|
-
{
|
|
306
|
-
my ($node, $attr) = @_;
|
|
307
|
-
return ($node->att($attr) ? $node->att($attr) : "");
|
|
308
|
-
}
|
|
309
|
-
|
|
310
|
-
sub SetNodeAttr
|
|
311
|
-
{
|
|
312
|
-
my ($node, $attr, $value) = @_;
|
|
313
|
-
$node->set_att($attr, $value);
|
|
314
|
-
}
|
|
315
|
-
|
|
316
|
-
sub GetNodeText
|
|
317
|
-
{
|
|
318
|
-
my ($node) = @_;
|
|
319
|
-
return $node->text;
|
|
320
|
-
}
|
|
321
|
-
|
|
322
|
-
sub SetNodeText
|
|
323
|
-
{
|
|
324
|
-
my ($node, $value) = @_;
|
|
325
|
-
$node->set_text($value);
|
|
326
|
-
}
|
|
327
|
-
|
|
328
|
-
1;
|
data/parscit/lib/Omni/Omnidoc.pm
DELETED
|
@@ -1,153 +0,0 @@
|
|
|
1
|
-
package Omni::Omnidoc;
|
|
2
|
-
|
|
3
|
-
# Configuration
|
|
4
|
-
use strict;
|
|
5
|
-
|
|
6
|
-
# Local libraries
|
|
7
|
-
use Omni::Config;
|
|
8
|
-
use Omni::Omnipage;
|
|
9
|
-
|
|
10
|
-
# Extern libraries
|
|
11
|
-
use XML::Twig;
|
|
12
|
-
use XML::Parser;
|
|
13
|
-
|
|
14
|
-
# Global variables
|
|
15
|
-
my $tag_list = $Omni::Config::tag_list;
|
|
16
|
-
my $att_list = $Omni::Config::att_list;
|
|
17
|
-
my $obj_list = $Omni::Config::obj_list;
|
|
18
|
-
|
|
19
|
-
# Temporary variables
|
|
20
|
-
my $tmp_content = undef;
|
|
21
|
-
my @tmp_pages = ();
|
|
22
|
-
|
|
23
|
-
###
|
|
24
|
-
# A whole document object in Omnipage xml: a document contains many pages
|
|
25
|
-
#
|
|
26
|
-
# Do Hoang Nhat Huy, 09 Jan 2011
|
|
27
|
-
###
|
|
28
|
-
# Initialization
|
|
29
|
-
sub new
|
|
30
|
-
{
|
|
31
|
-
my ($class) = @_;
|
|
32
|
-
|
|
33
|
-
# Pages: a document can have multiple pages
|
|
34
|
-
my @pages = ();
|
|
35
|
-
|
|
36
|
-
# Class members
|
|
37
|
-
my $self = { '_self' => $obj_list->{ 'OMNIDOC' },
|
|
38
|
-
'_raw' => undef,
|
|
39
|
-
'_content' => undef,
|
|
40
|
-
'_pages' => \@pages };
|
|
41
|
-
|
|
42
|
-
bless $self, $class;
|
|
43
|
-
return $self;
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
#
|
|
47
|
-
sub set_raw
|
|
48
|
-
{
|
|
49
|
-
my ($self, $raw) = @_;
|
|
50
|
-
|
|
51
|
-
# Save the raw xml <document> ... </document>
|
|
52
|
-
$self->{ '_raw' } = $raw;
|
|
53
|
-
|
|
54
|
-
# At first, content is blank
|
|
55
|
-
$tmp_content = "";
|
|
56
|
-
# because there's no document
|
|
57
|
-
@tmp_pages = ();
|
|
58
|
-
|
|
59
|
-
# Parse the raw string
|
|
60
|
-
my $twig_roots = { $tag_list->{ 'DOCUMENT' } => 1 };
|
|
61
|
-
my $twig_handlers = { $tag_list->{ 'DOCUMENT' } => \&parse};
|
|
62
|
-
|
|
63
|
-
# XML::Twig
|
|
64
|
-
my $twig = new XML::Twig( twig_roots => $twig_roots,
|
|
65
|
-
twig_handlers => $twig_handlers,
|
|
66
|
-
pretty_print => 'indented' );
|
|
67
|
-
|
|
68
|
-
# Start the XML parsing
|
|
69
|
-
$twig->parse($raw);
|
|
70
|
-
$twig->purge;
|
|
71
|
-
|
|
72
|
-
# Copy information from temporary variables to class members
|
|
73
|
-
|
|
74
|
-
# Copy all pages
|
|
75
|
-
@{$self->{ '_pages' } } = @tmp_pages;
|
|
76
|
-
|
|
77
|
-
# Copy content
|
|
78
|
-
$self->{ '_content' } = $tmp_content;
|
|
79
|
-
}
|
|
80
|
-
|
|
81
|
-
sub get_raw
|
|
82
|
-
{
|
|
83
|
-
my ($self) = @_;
|
|
84
|
-
return $self->{ '_raw' };
|
|
85
|
-
}
|
|
86
|
-
|
|
87
|
-
sub parse
|
|
88
|
-
{
|
|
89
|
-
my ($twig, $node) = @_;
|
|
90
|
-
|
|
91
|
-
# Get <document> node attributes
|
|
92
|
-
|
|
93
|
-
# Check if there's any page
|
|
94
|
-
my @all_pages = $node->descendants( $tag_list->{ 'PAGE' } );
|
|
95
|
-
foreach my $pg (@all_pages)
|
|
96
|
-
{
|
|
97
|
-
my $page = new Omni::Omnipage();
|
|
98
|
-
|
|
99
|
-
# Set raw content
|
|
100
|
-
$page->set_raw($pg->sprint());
|
|
101
|
-
|
|
102
|
-
# Update page list
|
|
103
|
-
push @tmp_pages, $page;
|
|
104
|
-
|
|
105
|
-
# Update content
|
|
106
|
-
$tmp_content = $tmp_content . $page->get_content() . "\n";
|
|
107
|
-
}
|
|
108
|
-
}
|
|
109
|
-
|
|
110
|
-
sub get_name
|
|
111
|
-
{
|
|
112
|
-
my ($self) = @_;
|
|
113
|
-
return $self->{ '_self' };
|
|
114
|
-
}
|
|
115
|
-
|
|
116
|
-
sub get_objs_ref
|
|
117
|
-
{
|
|
118
|
-
my ($self) = @_;
|
|
119
|
-
return $self->{ '_pages' };
|
|
120
|
-
}
|
|
121
|
-
|
|
122
|
-
sub get_content
|
|
123
|
-
{
|
|
124
|
-
my ($self) = @_;
|
|
125
|
-
return $self->{ '_content' };
|
|
126
|
-
}
|
|
127
|
-
|
|
128
|
-
# Support functions
|
|
129
|
-
sub GetNodeAttr
|
|
130
|
-
{
|
|
131
|
-
my ($node, $attr) = @_;
|
|
132
|
-
return ($node->att($attr) ? $node->att($attr) : "");
|
|
133
|
-
}
|
|
134
|
-
|
|
135
|
-
sub SetNodeAttr
|
|
136
|
-
{
|
|
137
|
-
my ($node, $attr, $value) = @_;
|
|
138
|
-
$node->set_att($attr, $value);
|
|
139
|
-
}
|
|
140
|
-
|
|
141
|
-
sub GetNodeText
|
|
142
|
-
{
|
|
143
|
-
my ($node) = @_;
|
|
144
|
-
return $node->text;
|
|
145
|
-
}
|
|
146
|
-
|
|
147
|
-
sub SetNodeText
|
|
148
|
-
{
|
|
149
|
-
my ($node, $value) = @_;
|
|
150
|
-
$node->set_text($value);
|
|
151
|
-
}
|
|
152
|
-
|
|
153
|
-
1;
|
|
@@ -1,223 +0,0 @@
|
|
|
1
|
-
package Omni::Omniframe;
|
|
2
|
-
|
|
3
|
-
# Configuration
|
|
4
|
-
use strict;
|
|
5
|
-
|
|
6
|
-
# Local libraries
|
|
7
|
-
use Omni::Config;
|
|
8
|
-
use Omni::Omnipara;
|
|
9
|
-
use Omni::Omnitable;
|
|
10
|
-
|
|
11
|
-
# Extern libraries
|
|
12
|
-
use XML::Twig;
|
|
13
|
-
use XML::Parser;
|
|
14
|
-
|
|
15
|
-
# Global variables
|
|
16
|
-
my $tag_list = $Omni::Config::tag_list;
|
|
17
|
-
my $att_list = $Omni::Config::att_list;
|
|
18
|
-
my $obj_list = $Omni::Config::obj_list;
|
|
19
|
-
|
|
20
|
-
###
|
|
21
|
-
# A frame object in Omnipage xml: a frame contains paragraphs
|
|
22
|
-
# (this is my observation and can be invalid under close scrutiny
|
|
23
|
-
# of new "evidence")
|
|
24
|
-
#
|
|
25
|
-
# Do Hoang Nhat Huy, 23 Feb 2011
|
|
26
|
-
###
|
|
27
|
-
# Initialization
|
|
28
|
-
sub new
|
|
29
|
-
{
|
|
30
|
-
my ($class) = @_;
|
|
31
|
-
|
|
32
|
-
# Objs: paragraphs
|
|
33
|
-
my @objs = ();
|
|
34
|
-
|
|
35
|
-
# Class members
|
|
36
|
-
my $self = { '_self' => $obj_list->{ 'OMNIFRAME' },
|
|
37
|
-
'_raw' => undef,
|
|
38
|
-
'_content' => undef,
|
|
39
|
-
'_bottom' => undef,
|
|
40
|
-
'_top' => undef,
|
|
41
|
-
'_left' => undef,
|
|
42
|
-
'_right' => undef,
|
|
43
|
-
'_objs' => \@objs };
|
|
44
|
-
|
|
45
|
-
bless $self, $class;
|
|
46
|
-
return $self;
|
|
47
|
-
}
|
|
48
|
-
|
|
49
|
-
sub set_raw
|
|
50
|
-
{
|
|
51
|
-
my ($self, $raw) = @_;
|
|
52
|
-
|
|
53
|
-
# Save the raw xml <frame> ... </frame>
|
|
54
|
-
$self->{ '_raw' } = $raw;
|
|
55
|
-
|
|
56
|
-
# Parse the raw string
|
|
57
|
-
my $twig_roots = { $tag_list->{ 'FRAME' } => 1 };
|
|
58
|
-
my $twig_handlers = { $tag_list->{ 'FRAME' } => sub { parse(@_, \$self); } };
|
|
59
|
-
|
|
60
|
-
# XML::Twig
|
|
61
|
-
my $twig = new XML::Twig( twig_roots => $twig_roots,
|
|
62
|
-
twig_handlers => $twig_handlers,
|
|
63
|
-
pretty_print => 'indented' );
|
|
64
|
-
|
|
65
|
-
# Start the XML parsing
|
|
66
|
-
$twig->parse($raw, \$self);
|
|
67
|
-
$twig->purge;
|
|
68
|
-
}
|
|
69
|
-
|
|
70
|
-
sub get_raw
|
|
71
|
-
{
|
|
72
|
-
my ($self) = @_;
|
|
73
|
-
return $self->{ '_raw' };
|
|
74
|
-
}
|
|
75
|
-
|
|
76
|
-
sub parse
|
|
77
|
-
{
|
|
78
|
-
my ($twig, $node, $self) = @_;
|
|
79
|
-
|
|
80
|
-
# At first, content is blank
|
|
81
|
-
my $tmp_content = "";
|
|
82
|
-
# because there's no object
|
|
83
|
-
my @tmp_objs = ();
|
|
84
|
-
|
|
85
|
-
# Get <frame> node attributes
|
|
86
|
-
my $tmp_bottom = GetNodeAttr($node, $att_list->{ 'BOTTOM' });
|
|
87
|
-
my $tmp_top = GetNodeAttr($node, $att_list->{ 'TOP' });
|
|
88
|
-
my $tmp_left = GetNodeAttr($node, $att_list->{ 'LEFT' });
|
|
89
|
-
my $tmp_right = GetNodeAttr($node, $att_list->{ 'RIGHT' });
|
|
90
|
-
|
|
91
|
-
# Check if there's any paragraph or table
|
|
92
|
-
# The large number of possible children is due to the
|
|
93
|
-
# ambiguous structure of the Omnipage XML
|
|
94
|
-
my $para_tag = $tag_list->{ 'PARA' };
|
|
95
|
-
my $table_tag = $tag_list->{ 'TABLE' };
|
|
96
|
-
|
|
97
|
-
# Get the first child in the body text
|
|
98
|
-
my $child = $node->first_child();
|
|
99
|
-
|
|
100
|
-
while (defined $child)
|
|
101
|
-
{
|
|
102
|
-
my $xpath = $child->path();
|
|
103
|
-
|
|
104
|
-
# if this child is <para>
|
|
105
|
-
if ($xpath =~ m/\/$para_tag$/)
|
|
106
|
-
{
|
|
107
|
-
my $para = new Omni::Omnipara();
|
|
108
|
-
|
|
109
|
-
# Set raw content
|
|
110
|
-
$para->set_raw($child->sprint());
|
|
111
|
-
|
|
112
|
-
# Update paragraph list
|
|
113
|
-
push @tmp_objs, $para;
|
|
114
|
-
|
|
115
|
-
# Update content
|
|
116
|
-
$tmp_content = $tmp_content . $para->get_content() . "\n";
|
|
117
|
-
}
|
|
118
|
-
elsif ($xpath =~ m/\/$table_tag$/)
|
|
119
|
-
{
|
|
120
|
-
my $table = new Omni::Omnitable();
|
|
121
|
-
|
|
122
|
-
# Set raw content
|
|
123
|
-
$table->set_raw($child->sprint());
|
|
124
|
-
|
|
125
|
-
# Update object list
|
|
126
|
-
push @tmp_objs, $table;
|
|
127
|
-
|
|
128
|
-
# Update content
|
|
129
|
-
$tmp_content = $tmp_content . $table->get_content() . "\n";
|
|
130
|
-
}
|
|
131
|
-
|
|
132
|
-
# Little brother
|
|
133
|
-
if ($child->is_last_child)
|
|
134
|
-
{
|
|
135
|
-
last;
|
|
136
|
-
}
|
|
137
|
-
else
|
|
138
|
-
{
|
|
139
|
-
$child = $child->next_sibling();
|
|
140
|
-
}
|
|
141
|
-
}
|
|
142
|
-
|
|
143
|
-
# Copy information from temporary variables to class members
|
|
144
|
-
$$self->{ '_bottom' } = $tmp_bottom;
|
|
145
|
-
$$self->{ '_top' } = $tmp_top;
|
|
146
|
-
$$self->{ '_left' } = $tmp_left;
|
|
147
|
-
$$self->{ '_right' } = $tmp_right;
|
|
148
|
-
|
|
149
|
-
# Copy all objects
|
|
150
|
-
@{$$self->{ '_objs' } } = @tmp_objs;
|
|
151
|
-
|
|
152
|
-
# Copy content
|
|
153
|
-
$$self->{ '_content' } = $tmp_content;
|
|
154
|
-
}
|
|
155
|
-
|
|
156
|
-
sub get_name
|
|
157
|
-
{
|
|
158
|
-
my ($self) = @_;
|
|
159
|
-
return $self->{ '_self' };
|
|
160
|
-
}
|
|
161
|
-
|
|
162
|
-
sub get_objs_ref
|
|
163
|
-
{
|
|
164
|
-
my ($self) = @_;
|
|
165
|
-
return $self->{ '_objs' };
|
|
166
|
-
}
|
|
167
|
-
|
|
168
|
-
sub get_content
|
|
169
|
-
{
|
|
170
|
-
my ($self) = @_;
|
|
171
|
-
return $self->{ '_content' };
|
|
172
|
-
}
|
|
173
|
-
|
|
174
|
-
sub get_bottom_pos
|
|
175
|
-
{
|
|
176
|
-
my ($self) = @_;
|
|
177
|
-
return $self->{ '_bottom' };
|
|
178
|
-
}
|
|
179
|
-
|
|
180
|
-
sub get_top_pos
|
|
181
|
-
{
|
|
182
|
-
my ($self) = @_;
|
|
183
|
-
return $self->{ '_top' };
|
|
184
|
-
}
|
|
185
|
-
|
|
186
|
-
sub get_left_pos
|
|
187
|
-
{
|
|
188
|
-
my ($self) = @_;
|
|
189
|
-
return $self->{ '_left' };
|
|
190
|
-
}
|
|
191
|
-
|
|
192
|
-
sub get_right_pos
|
|
193
|
-
{
|
|
194
|
-
my ($self) = @_;
|
|
195
|
-
return $self->{ '_right' };
|
|
196
|
-
}
|
|
197
|
-
|
|
198
|
-
# Support functions
|
|
199
|
-
sub GetNodeAttr
|
|
200
|
-
{
|
|
201
|
-
my ($node, $attr) = @_;
|
|
202
|
-
return ($node->att($attr) ? $node->att($attr) : "");
|
|
203
|
-
}
|
|
204
|
-
|
|
205
|
-
sub SetNodeAttr
|
|
206
|
-
{
|
|
207
|
-
my ($node, $attr, $value) = @_;
|
|
208
|
-
$node->set_att($attr, $value);
|
|
209
|
-
}
|
|
210
|
-
|
|
211
|
-
sub GetNodeText
|
|
212
|
-
{
|
|
213
|
-
my ($node) = @_;
|
|
214
|
-
return $node->text;
|
|
215
|
-
}
|
|
216
|
-
|
|
217
|
-
sub SetNodeText
|
|
218
|
-
{
|
|
219
|
-
my ($node, $value) = @_;
|
|
220
|
-
$node->set_text($value);
|
|
221
|
-
}
|
|
222
|
-
|
|
223
|
-
1;
|