biblicit 2.0.3 → 2.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +0 -2
- data/biblicit.gemspec +1 -1
- data/parscit/bin/citeExtract.pl +9 -161
- data/parscit/bin/sectExtract.pl +0 -14
- data/parscit/lib/ParsCit/Controller.pm +0 -59
- data/parscit/lib/ParsCit/PreProcess.pm +0 -4
- data/parscit/lib/ParsCit/Tr2crfpp.pm +1 -7
- metadata +4 -24
- data/parscit/bin/sectLabel/processOmniXML.pl +0 -1427
- data/parscit/bin/sectLabel/processOmniXML_new.pl +0 -1025
- data/parscit/bin/sectLabel/processOmniXMLv2.pl +0 -1529
- data/parscit/bin/sectLabel/processOmniXMLv3.pl +0 -964
- data/parscit/bin/sectLabel/simplifyOmniXML.pl +0 -382
- data/parscit/bin/xml2train.pl +0 -193
- data/parscit/lib/Omni/Config.pm +0 -93
- data/parscit/lib/Omni/Omnicell.pm +0 -263
- data/parscit/lib/Omni/Omnicol.pm +0 -292
- data/parscit/lib/Omni/Omnidd.pm +0 -328
- data/parscit/lib/Omni/Omnidoc.pm +0 -153
- data/parscit/lib/Omni/Omniframe.pm +0 -223
- data/parscit/lib/Omni/Omniline.pm +0 -423
- data/parscit/lib/Omni/Omnipage.pm +0 -282
- data/parscit/lib/Omni/Omnipara.pm +0 -232
- data/parscit/lib/Omni/Omnirun.pm +0 -303
- data/parscit/lib/Omni/Omnitable.pm +0 -336
- data/parscit/lib/Omni/Omniword.pm +0 -162
- data/parscit/lib/Omni/Traversal.pm +0 -313
- data/parscit/lib/SectLabel/AAMatching.pm +0 -1949
|
@@ -1,423 +0,0 @@
|
|
|
1
|
-
package Omni::Omniline;
|
|
2
|
-
|
|
3
|
-
# Configuration
|
|
4
|
-
use strict;
|
|
5
|
-
|
|
6
|
-
# Local libraries
|
|
7
|
-
use Omni::Config;
|
|
8
|
-
use Omni::Omniword;
|
|
9
|
-
use Omni::Omnirun;
|
|
10
|
-
|
|
11
|
-
# Extern libraries
|
|
12
|
-
use XML::Twig;
|
|
13
|
-
use XML::Parser;
|
|
14
|
-
use XML::Writer;
|
|
15
|
-
use XML::Writer::String;
|
|
16
|
-
|
|
17
|
-
# Global variables
|
|
18
|
-
my $tag_list = $Omni::Config::tag_list;
|
|
19
|
-
my $att_list = $Omni::Config::att_list;
|
|
20
|
-
my $obj_list = $Omni::Config::obj_list;
|
|
21
|
-
|
|
22
|
-
# Temporary variables
|
|
23
|
-
my $tmp_content = undef;
|
|
24
|
-
my $tmp_baseline = undef;
|
|
25
|
-
my $tmp_bottom = undef;
|
|
26
|
-
my $tmp_top = undef;
|
|
27
|
-
my $tmp_left = undef;
|
|
28
|
-
my $tmp_right = undef;
|
|
29
|
-
my @tmp_objs = ();
|
|
30
|
-
|
|
31
|
-
###
|
|
32
|
-
# A line object in Omnipage xml: a line can contain one or many runs
|
|
33
|
-
#
|
|
34
|
-
# Do Hoang Nhat Huy, 09 Jan 2011
|
|
35
|
-
###
|
|
36
|
-
# Initialization
|
|
37
|
-
# Constructor: builds an empty line object.
#
# Every field starts out undefined except '_self' (the type label taken from
# the OMNILINE entry of the configured object list) and '_objs' (a fresh,
# empty array that will hold the runs/words of the line).
sub new
{
	my $class = shift;

	my $self = bless {
		'_self'     => $obj_list->{ 'OMNILINE' },
		'_raw'      => undef,
		'_content'  => undef,
		'_baseline' => undef,
		'_bottom'   => undef,
		'_top'      => undef,
		'_left'     => undef,
		'_right'    => undef,
		'_bullet'   => undef,
		# Each object gets its own list of child objects
		'_objs'     => [],
	}, $class;

	return $self;
}
|
|
59
|
-
|
|
60
|
-
#
|
|
61
|
-
# Store the raw XML of a line and parse it.
#
#   $self - the line object
#   $raw  - XML string expected to contain the line element
#
# The actual parsing is done by the parse() twig handler, which communicates
# through the file-level temporaries; their values are copied into the object
# once the parse is over. The value of the last assignment (the line content)
# is what the sub evaluates to, as before.
sub set_raw
{
	my ($self, $raw) = @_;

	# Save the raw xml <ln> ... </ln>
	$self->{ '_raw' } = $raw;

	# The temporaries are shared by every object of this package and are only
	# written by parse(). Clear them first so that, when the handler never
	# fires (no line element in $raw), this object does not silently inherit
	# the data of the previously parsed line.
	$tmp_content  = undef;
	$tmp_baseline = undef;
	$tmp_bottom   = undef;
	$tmp_top      = undef;
	$tmp_left     = undef;
	$tmp_right    = undef;
	@tmp_objs     = ();

	# Only the line element is of interest, and it is handled by parse()
	my $line_tag = $tag_list->{ 'LINE' };
	my $twig     = XML::Twig->new(	twig_roots    => { $line_tag => 1 },
									twig_handlers => { $line_tag => \&parse },
									pretty_print  => 'indented' );

	# Start the XML parsing
	$twig->parse($raw);
	$twig->purge;

	# Copy information from temporary variables to class members
	$self->{ '_baseline' } = $tmp_baseline;
	$self->{ '_bottom' }   = $tmp_bottom;
	$self->{ '_top' }      = $tmp_top;
	$self->{ '_left' }     = $tmp_left;
	$self->{ '_right' }    = $tmp_right;

	# Copy all objects
	@{ $self->{ '_objs' } } = @tmp_objs;

	# Copy content
	$self->{ '_content' } = $tmp_content;
}
|
|
94
|
-
|
|
95
|
-
# Accessor: the raw XML string last handed to set_raw (undef before that).
sub get_raw
{
	my $self = shift;
	return $self->{_raw};
}
|
|
100
|
-
|
|
101
|
-
# Build the XML of a synthetic <run> element.
#
#   $format_node - element whose font/format attributes are copied to <run>
#   $word_node   - element whose position attributes are copied to an inner
#                  <wd>; pass undef to emit no <wd> wrapper at all
#   $inner_xml   - already serialized XML placed verbatim inside the wrapper
#
# Returns the serialized XML string.
sub _fake_run_xml
{
	my ($format_node, $word_node, $inner_xml) = @_;

	my $output = XML::Writer::String->new();
	# UNSAFE is required because raw() below injects pre-serialized XML
	my $writer = XML::Writer->new(OUTPUT => $output, UNSAFE => 'true');

	# Form the fake <run>
	my @run_atts = ();
	foreach my $key (qw(FONTFACE FONTFAMILY FONTPITCH FONTSIZE SPACING SUSCRIPT UNDERLINE BOLD ITALIC))
	{
		push @run_atts, $att_list->{ $key }, GetNodeAttr($format_node, $att_list->{ $key });
	}
	$writer->startTag("run", @run_atts);

	# Form the fake <wd>, if requested
	if (defined $word_node)
	{
		my @word_atts = ();
		foreach my $key (qw(BOTTOM TOP LEFT RIGHT))
		{
			push @word_atts, $att_list->{ $key }, GetNodeAttr($word_node, $att_list->{ $key });
		}
		$writer->startTag("wd", @word_atts);
	}

	$writer->raw($inner_xml);

	if (defined $word_node) { $writer->endTag("wd"); }
	$writer->endTag("run");
	$writer->end();

	return $output->value();
}

# Create an Omni::Omnirun from its raw XML, add it to the temporary object
# list and append its text to the temporary line content.
sub _append_run
{
	my ($run_xml) = @_;

	my $run = Omni::Omnirun->new();
	$run->set_raw($run_xml);

	push @tmp_objs, $run;
	$tmp_content .= $run->get_content();
}

# Twig handler for the line element.
#
#   $twig - the XML::Twig object driving the parse (not used here)
#   $node - the twig element of the line
#
# Fills the file-level temporaries: the position attributes of the line, the
# list of Omni::Omnirun objects built from it, and its text content.
# set_raw() copies them into the object afterwards.
sub parse
{
	my ($twig, $node) = @_;

	# At first, content is blank because there's no run
	$tmp_content = "";
	@tmp_objs    = ();

	# Get <line> node attributes
	$tmp_bottom   = GetNodeAttr($node, $att_list->{ 'BOTTOM' });
	$tmp_top      = GetNodeAttr($node, $att_list->{ 'TOP' });
	$tmp_left     = GetNodeAttr($node, $att_list->{ 'LEFT' });
	$tmp_right    = GetNodeAttr($node, $att_list->{ 'RIGHT' });
	$tmp_baseline = GetNodeAttr($node, $att_list->{ 'BASELINE' });

	my $run_tag = $tag_list->{ 'RUN' };

	# Check if there's any run anywhere below the line
	my @all_runs = $node->descendants($run_tag);

	if (scalar(@all_runs) == 0)
	{
		# There is not: the line carries the format attributes itself, so
		# wrap its whole inner content in one fake <run> (no <wd> wrapper)
		_append_run( _fake_run_xml($node, undef, $node->xml_string()) );
	}
	else
	{
		# Some type of child
		my $space_tag   = $tag_list->{ 'SPACE' };
		my $tab_tag     = $tag_list->{ 'TAB' };
		my $newline_tag = $tag_list->{ 'NEWLINE' };
		my $word_tag    = $tag_list->{ 'WORD' };

		# A line can contain both <run> and <wd> as direct children
		foreach my $child ($node->children())
		{
			my $xpath = $child->path();

			# if this child is <run>
			if ($xpath =~ m/\/$run_tag$/)
			{
				_append_run( $child->sprint() );
			}
			# if this child is <wd>
			elsif ($xpath =~ m/\/$word_tag$/)
			{
				# One word can contain many <run>
				my @word_runs = $child->children($run_tag);

				if (scalar(@word_runs) > 0)
				{
					# NOTE: a word holding several runs has parts in different
					# formats. Every such run becomes its own fake run here:
					# the format comes from the inner <run>, while the fake
					# <wd> inside it repeats the position of the whole word.
					foreach my $grand_child (@word_runs)
					{
						_append_run( _fake_run_xml($grand_child, $child, $grand_child->xml_string()) );
					}
				}
				# Special case: <wd> contains no <run> but stores the format itself
				else
				{
					_append_run( _fake_run_xml($child, $child, $child->xml_string()) );
				}
			}
			elsif ($xpath =~ m/\/$space_tag$/)
			{
				$tmp_content .= " ";
			}
			elsif ($xpath =~ m/\/$tab_tag$/)
			{
				$tmp_content .= "\t";
			}
			elsif ($xpath =~ m/\/$newline_tag$/)
			{
				$tmp_content .= "\n";
			}
		}
	}
}
|
|
337
|
-
|
|
338
|
-
# Accessor: the bullet value stored with set_bullet (undef until it is set).
sub get_bullet
{
	my $self = shift;
	return $self->{_bullet};
}
|
|
343
|
-
|
|
344
|
-
# Mutator: remember the given bullet value on the line object.
sub set_bullet
{
	my $self   = shift;
	my $bullet = shift;

	$self->{_bullet} = $bullet;
}
|
|
349
|
-
|
|
350
|
-
# Accessor: the type label of this object, as assigned by the constructor.
sub get_name
{
	my $self = shift;
	return $self->{_self};
}
|
|
355
|
-
|
|
356
|
-
# Accessor: reference to the array of child objects of this line. The
# reference itself is returned, not a copy.
sub get_objs_ref
{
	my $self = shift;
	return $self->{_objs};
}
|
|
361
|
-
|
|
362
|
-
# Accessor: the text content collected for this line by set_raw.
sub get_content
{
	my $self = shift;
	return $self->{_content};
}
|
|
367
|
-
|
|
368
|
-
# Accessor: the baseline attribute of the line as captured by set_raw.
sub get_baseline
{
	my $self = shift;
	return $self->{_baseline};
}
|
|
373
|
-
|
|
374
|
-
# Accessor: the bottom attribute of the line as captured by set_raw.
sub get_bottom_pos
{
	my $self = shift;
	return $self->{_bottom};
}
|
|
379
|
-
|
|
380
|
-
# Accessor: the top attribute of the line as captured by set_raw.
sub get_top_pos
{
	my $self = shift;
	return $self->{_top};
}
|
|
385
|
-
|
|
386
|
-
# Accessor: the left attribute of the line as captured by set_raw.
sub get_left_pos
{
	my $self = shift;
	return $self->{_left};
}
|
|
391
|
-
|
|
392
|
-
# Accessor: the right attribute of the line as captured by set_raw.
sub get_right_pos
{
	my $self = shift;
	return $self->{_right};
}
|
|
397
|
-
|
|
398
|
-
# Support functions
|
|
399
|
-
# Read an attribute from a twig element.
#
#   $node - element to query (anything offering an att() method)
#   $attr - attribute name
#
# Returns the attribute value, or the empty string when the attribute is not
# present. The test is on definedness rather than truth, so a value of "0"
# (e.g. a coordinate at the page edge) is returned as "0" and not turned
# into "".
sub GetNodeAttr
{
	my ($node, $attr) = @_;

	# Look the attribute up only once
	my $value = $node->att($attr);

	return (defined $value ? $value : "");
}
|
|
404
|
-
|
|
405
|
-
# Set attribute $attr of the element $node to $value by delegating to the
# element's set_att() method.
sub SetNodeAttr
{
	my $node  = shift;
	my $attr  = shift;
	my $value = shift;

	$node->set_att($attr, $value);
}
|
|
410
|
-
|
|
411
|
-
# Return whatever the element's text() method yields for $node.
sub GetNodeText
{
	my $node = shift;
	return $node->text;
}
|
|
416
|
-
|
|
417
|
-
# Replace the text of the element $node with $value by delegating to the
# element's set_text() method.
sub SetNodeText
{
	my $node  = shift;
	my $value = shift;

	$node->set_text($value);
}
|
|
422
|
-
|
|
423
|
-
1;
|
|
@@ -1,282 +0,0 @@
|
|
|
1
|
-
package Omni::Omnipage;
|
|
2
|
-
|
|
3
|
-
# Configuration
|
|
4
|
-
use strict;
|
|
5
|
-
|
|
6
|
-
# Local libraries
|
|
7
|
-
use Omni::Config;
|
|
8
|
-
use Omni::Omnidd;
|
|
9
|
-
use Omni::Omnicol;
|
|
10
|
-
use Omni::Omniframe;
|
|
11
|
-
|
|
12
|
-
# Extern libraries
|
|
13
|
-
use XML::Twig;
|
|
14
|
-
use XML::Parser;
|
|
15
|
-
|
|
16
|
-
# Global variables
|
|
17
|
-
my $tag_list = $Omni::Config::tag_list;
|
|
18
|
-
my $att_list = $Omni::Config::att_list;
|
|
19
|
-
my $obj_list = $Omni::Config::obj_list;
|
|
20
|
-
|
|
21
|
-
# Temporary variables
|
|
22
|
-
my $tmp_content = undef;
|
|
23
|
-
my @tmp_objs = ();
|
|
24
|
-
|
|
25
|
-
###
|
|
26
|
-
# A page object in Omnipage xml: a page contains zero or many columns
|
|
27
|
-
#
|
|
28
|
-
# Do Hoang Nhat Huy, 09 Jan 2011
|
|
29
|
-
###
|
|
30
|
-
# Initialization
|
|
31
|
-
# Constructor: builds an empty page object.
#
# '_self' is the type label taken from the OMNIPAGE entry of the configured
# object list; '_raw' and '_content' start undefined; '_objs' is a fresh,
# empty array for the objects (columns, dd, frames) found on the page.
sub new
{
	my $class = shift;

	my $self = bless {
		'_self'    => $obj_list->{ 'OMNIPAGE' },
		'_raw'     => undef,
		'_content' => undef,
		# Each object gets its own list of child objects
		'_objs'    => [],
	}, $class;

	return $self;
}
|
|
47
|
-
|
|
48
|
-
#
|
|
49
|
-
# Store the raw XML of a page and parse it.
#
#   $self - the page object
#   $raw  - XML string expected to contain the page element
#
# The actual parsing is done by the parse() twig handler, which communicates
# through the file-level temporaries; their values are copied into the object
# once the parse is over. The value of the last assignment (the page content)
# is what the sub evaluates to, as before.
sub set_raw
{
	my ($self, $raw) = @_;

	# Save the raw xml <page> ... </page>
	$self->{ '_raw' } = $raw;

	# The temporaries are shared by every object of this package and are only
	# written by parse(). Clear them first so that, when the handler never
	# fires (no page element in $raw), this object does not silently inherit
	# the data of the previously parsed page.
	$tmp_content = undef;
	@tmp_objs    = ();

	# Only the page element is of interest, and it is handled by parse()
	my $page_tag = $tag_list->{ 'PAGE' };
	my $twig     = XML::Twig->new(	twig_roots    => { $page_tag => 1 },
									twig_handlers => { $page_tag => \&parse },
									pretty_print  => 'indented' );

	# Start the XML parsing
	$twig->parse($raw);
	$twig->purge;

	# Copy all collected objects
	@{ $self->{ '_objs' } } = @tmp_objs;

	# Copy content
	$self->{ '_content' } = $tmp_content;
}
|
|
77
|
-
|
|
78
|
-
# Accessor: the raw XML string last handed to set_raw (undef before that).
sub get_raw
{
	my $self = shift;
	return $self->{_raw};
}
|
|
83
|
-
|
|
84
|
-
# If $elt is a column, dd or frame element, wrap it in the matching Omni
# object, add that object to the temporary object list and append its text
# (followed by a newline) to the temporary page content. Any other element
# is ignored.
sub _collect_page_object
{
	my ($elt) = @_;

	my $dd_tag     = $tag_list->{ 'DD' };
	my $column_tag = $tag_list->{ 'COL' };
	my $frame_tag  = $tag_list->{ 'FRAME' };

	my $xpath = $elt->path();
	my $obj   = undef;

	if    ($xpath =~ m/\/$column_tag$/) { $obj = Omni::Omnicol->new();   }
	elsif ($xpath =~ m/\/$dd_tag$/)     { $obj = Omni::Omnidd->new();    }
	elsif ($xpath =~ m/\/$frame_tag$/)  { $obj = Omni::Omniframe->new(); }
	else                                { return; }

	# Set raw content: always the element being examined itself
	$obj->set_raw($elt->sprint());

	# Update object list
	push @tmp_objs, $obj;

	# Update content
	$tmp_content .= $obj->get_content() . "\n";

	return;
}

# Twig handler for the page element.
#
#   $twig - the XML::Twig object driving the parse (not used here)
#   $node - the twig element of the page
#
# Fills the file-level temporaries with the objects found in the page body
# and their concatenated text; set_raw() copies them into the object
# afterwards. Columns, dd and frames are collected both when they are direct
# children of the body and when they are children of a section.
#
# Compared with the previous implementation, objects found inside a section
# are now built from their own XML (dd and frame used to be given the XML of
# the whole enclosing section), and frames inside a section are actually
# detected (the test used to look at the section's path and never matched).
sub parse
{
	my ($twig, $node) = @_;

	# At first, content is blank because there's no column, table or image
	$tmp_content = "";
	@tmp_objs    = ();

	# Get the body text; a page with no body has nothing to collect
	my $body = $node->first_child( $tag_list->{ 'BODY' } );
	if (! defined $body) { return; }

	# The child of the body is usually <section> but it's not always the case
	my $section_tag = $tag_list->{ 'SECTION' };

	foreach my $child ($body->children())
	{
		if ($child->path() =~ m/\/$section_tag$/)
		{
			# Inside a section the interesting elements are one level deeper
			foreach my $grand_child ($child->children())
			{
				_collect_page_object($grand_child);
			}
		}
		else
		{
			_collect_page_object($child);
		}
	}
}
|
|
238
|
-
|
|
239
|
-
# Accessor: the type label of this object, as assigned by the constructor.
sub get_name
{
	my $self = shift;
	return $self->{_self};
}
|
|
244
|
-
|
|
245
|
-
# Accessor: reference to the array of objects collected for this page. The
# reference itself is returned, not a copy.
sub get_objs_ref
{
	my $self = shift;
	return $self->{_objs};
}
|
|
250
|
-
|
|
251
|
-
# Accessor: the text content collected for this page by set_raw.
sub get_content
{
	my $self = shift;
	return $self->{_content};
}
|
|
256
|
-
|
|
257
|
-
# Support functions
|
|
258
|
-
# Read an attribute from a twig element.
#
#   $node - element to query (anything offering an att() method)
#   $attr - attribute name
#
# Returns the attribute value, or the empty string when the attribute is not
# present. The test is on definedness rather than truth, so a value of "0"
# is returned as "0" and not turned into "".
sub GetNodeAttr
{
	my ($node, $attr) = @_;

	# Look the attribute up only once
	my $value = $node->att($attr);

	return (defined $value ? $value : "");
}
|
|
263
|
-
|
|
264
|
-
# Set attribute $attr of the element $node to $value by delegating to the
# element's set_att() method.
sub SetNodeAttr
{
	my $node  = shift;
	my $attr  = shift;
	my $value = shift;

	$node->set_att($attr, $value);
}
|
|
269
|
-
|
|
270
|
-
# Return whatever the element's text() method yields for $node.
sub GetNodeText
{
	my $node = shift;
	return $node->text;
}
|
|
275
|
-
|
|
276
|
-
# Replace the text of the element $node with $value by delegating to the
# element's set_text() method.
sub SetNodeText
{
	my $node  = shift;
	my $value = shift;

	$node->set_text($value);
}
|
|
281
|
-
|
|
282
|
-
1;
|