biblicit 2.0.3 → 2.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,423 +0,0 @@
1
- package Omni::Omniline;
2
-
3
- # Configuration
4
- use strict;
5
-
6
- # Local libraries
7
- use Omni::Config;
8
- use Omni::Omniword;
9
- use Omni::Omnirun;
10
-
11
- # Extern libraries
12
- use XML::Twig;
13
- use XML::Parser;
14
- use XML::Writer;
15
- use XML::Writer::String;
16
-
17
- # Global variables
18
# Lookup tables taken from Omni::Config: XML tag names, XML attribute names
# and Omni object names, as used by the subs below
my ($tag_list, $att_list, $obj_list) = (
    $Omni::Config::tag_list,
    $Omni::Config::att_list,
    $Omni::Config::obj_list,
);

# Scratch space: the XML::Twig handler (parse) fills these in and set_raw
# copies them into the object afterwards. All start out undefined/empty.
my ($tmp_content, $tmp_baseline, $tmp_bottom, $tmp_top, $tmp_left, $tmp_right);
my @tmp_objs;
30
-
31
- ###
32
- # A line object in Omnipage xml: a line can contain one or many runs
33
- #
34
- # Do Hoang Nhat Huy, 09 Jan 2011
35
- ###
36
- # Initialization
37
# Constructor: returns an empty line object. Every field except the object
# name is undefined until set_raw (or set_bullet) fills it in.
sub new {
    my $class = shift;

    my $self = bless {
        '_self'     => $obj_list->{ 'OMNILINE' },
        '_raw'      => undef,
        '_content'  => undef,
        '_baseline' => undef,
        '_bottom'   => undef,
        '_top'      => undef,
        '_left'     => undef,
        '_right'    => undef,
        '_bullet'   => undef,
        # A line can hold several runs or words; each object gets its own list
        '_objs'     => [],
    }, $class;

    return $self;
}
59
-
60
- #
61
# Store the raw XML of a line and parse it.
#
# Parameters:
#   $raw - XML string containing the line element
#
# The XML::Twig handler (parse) leaves its results in the module-level
# temporaries; they are copied into the object once parsing is finished.
sub set_raw
{
    my ($self, $raw) = @_;

    # Save the raw xml <ln> ... </ln>
    $self->{ '_raw' } = $raw;

    # Forget whatever the previous call left behind. Without this, input in
    # which the handler never fires would silently inherit the geometry,
    # runs and content of the line parsed before it.
    $tmp_content  = undef;
    $tmp_baseline = undef;
    $tmp_bottom   = undef;
    $tmp_top      = undef;
    $tmp_left     = undef;
    $tmp_right    = undef;
    @tmp_objs     = ();

    # Only the line element is of interest, and it is handled by parse
    my $line_tag = $tag_list->{ 'LINE' };
    my $twig = XML::Twig->new( twig_roots    => { $line_tag => 1 },
                               twig_handlers => { $line_tag => \&parse },
                               pretty_print  => 'indented' );

    # Start the XML parsing
    $twig->parse($raw);
    $twig->purge;

    # Copy information from temporary variables to class members
    $self->{ '_baseline' } = $tmp_baseline;
    $self->{ '_bottom' }   = $tmp_bottom;
    $self->{ '_top' }      = $tmp_top;
    $self->{ '_left' }     = $tmp_left;
    $self->{ '_right' }    = $tmp_right;

    # Copy all objects (into the existing list, so references handed out by
    # get_objs_ref stay valid)
    @{ $self->{ '_objs' } } = @tmp_objs;

    # Copy content
    $self->{ '_content' } = $tmp_content;
}
94
-
95
# Accessor: the raw XML string last given to set_raw.
sub get_raw {
    my $self = shift;
    return $self->{'_raw'};
}
100
-
101
# XML::Twig handler for a line element. Records the line geometry and turns
# the children of the line into Omni::Omnirun objects, leaving everything in
# the module-level temporaries for set_raw to pick up.
#
# Parameters:
#   $twig - the XML::Twig object driving the parse (not used here)
#   $node - the line element being handled
sub parse
{
    my ($twig, $node) = @_;

    # At first, content is blank because there's no run
    $tmp_content = "";
    @tmp_objs    = ();

    # Get <line> node attributes
    $tmp_bottom   = GetNodeAttr($node, $att_list->{ 'BOTTOM' });
    $tmp_top      = GetNodeAttr($node, $att_list->{ 'TOP' });
    $tmp_left     = GetNodeAttr($node, $att_list->{ 'LEFT' });
    $tmp_right    = GetNodeAttr($node, $att_list->{ 'RIGHT' });
    $tmp_baseline = GetNodeAttr($node, $att_list->{ 'BASELINE' });

    my $run_tag = $tag_list->{ 'RUN' };

    # No run anywhere below the line: the line stores the format itself, so
    # wrap its whole inner content in one fake <run>
    my @all_runs = $node->descendants( $run_tag );
    if (scalar(@all_runs) == 0)
    {
        _add_run( _fake_run_xml($node, $node->xml_string()) );
        return;
    }

    # Some type of child
    my $space_tag   = $tag_list->{ 'SPACE' };
    my $tab_tag     = $tag_list->{ 'TAB' };
    my $newline_tag = $tag_list->{ 'NEWLINE' };
    my $word_tag    = $tag_list->{ 'WORD' };

    # A line can contain both <run> and <wd>, so walk all children in order.
    # next_sibling returns undef after the last child, which ends the loop.
    for (my $child = $node->first_child(); defined $child; $child = $child->next_sibling())
    {
        my $xpath = $child->path();

        # if this child is <run>
        if ($xpath =~ m/\/$run_tag$/)
        {
            _add_run( $child->sprint() );
        }
        # if this child is <wd>
        elsif ($xpath =~ m/\/$word_tag$/)
        {
            # One word can contain many <run>, i.e. parts in different formats
            my @word_runs = $child->children( $run_tag );

            if (scalar(@word_runs) > 0)
            {
                # Every run of the word becomes its own Omnirun: the format
                # comes from the run, the position from the enclosing word
                foreach my $word_run (@word_runs)
                {
                    _add_run( _fake_run_xml($word_run, $word_run->xml_string(), $child) );
                }
            }
            # Special case: <wd> contains no <run> but stores the format itself
            else
            {
                _add_run( _fake_run_xml($child, $child->xml_string(), $child) );
            }
        }
        elsif ($xpath =~ m/\/$space_tag$/)
        {
            $tmp_content = $tmp_content . " ";
        }
        elsif ($xpath =~ m/\/$tab_tag$/)
        {
            $tmp_content = $tmp_content . "\t";
        }
        elsif ($xpath =~ m/\/$newline_tag$/)
        {
            $tmp_content = $tmp_content . "\n";
        }
    }

    return;
}

# Build the raw XML of a fake <run> whose format attributes are read from
# $format_node and whose body is $inner_xml. When $word_node is given, the
# body is additionally wrapped in a fake <wd> carrying that node's position.
sub _fake_run_xml
{
    my ($format_node, $inner_xml, $word_node) = @_;

    my $output = XML::Writer::String->new();
    my $writer = XML::Writer->new(OUTPUT => $output, UNSAFE => 'true');

    # Form the fake <run>
    my @format_atts = map { my $name = $att_list->{ $_ };
                            ( $name => GetNodeAttr($format_node, $name) ) }
                      qw(FONTFACE FONTFAMILY FONTPITCH FONTSIZE SPACING
                         SUSCRIPT UNDERLINE BOLD ITALIC);
    $writer->startTag( "run", @format_atts );

    # Form the fake <wd>
    if (defined $word_node)
    {
        my @position_atts = map { my $name = $att_list->{ $_ };
                                  ( $name => GetNodeAttr($word_node, $name) ) }
                            qw(BOTTOM TOP LEFT RIGHT);
        $writer->startTag( "wd", @position_atts );
    }

    # The inner content is already XML, so it is written without escaping
    $writer->raw( $inner_xml );

    $writer->endTag("wd") if defined $word_node;
    $writer->endTag("run");
    $writer->end();

    return $output->value();
}

# Create an Omni::Omnirun from raw <run> XML, append it to the temporary
# object list and append its text to the temporary line content.
sub _add_run
{
    my ($raw) = @_;

    my $run = Omni::Omnirun->new();
    $run->set_raw($raw);

    push @tmp_objs, $run;
    $tmp_content = $tmp_content . $run->get_content();

    return;
}
337
-
338
# Accessor: the value last stored with set_bullet.
sub get_bullet {
    my $self = shift;
    return $self->{'_bullet'};
}
343
-
344
# Mutator: remember $bullet on the line object.
sub set_bullet {
    my $self   = shift;
    my $bullet = shift;
    $self->{'_bullet'} = $bullet;
}
349
-
350
# Accessor: the object name recorded in '_self' by the constructor.
sub get_name {
    my $self = shift;
    return $self->{'_self'};
}
355
-
356
# Accessor: reference to the object's own list of child objects (not a copy).
sub get_objs_ref {
    my $self = shift;
    return $self->{'_objs'};
}
361
-
362
# Accessor: the text content stored in '_content'.
sub get_content {
    my $self = shift;
    return $self->{'_content'};
}
367
-
368
# Accessor: the baseline attribute stored in '_baseline'.
sub get_baseline {
    my $self = shift;
    return $self->{'_baseline'};
}
373
-
374
# Accessor: the bottom position stored in '_bottom'.
sub get_bottom_pos {
    my $self = shift;
    return $self->{'_bottom'};
}
379
-
380
# Accessor: the top position stored in '_top'.
sub get_top_pos {
    my $self = shift;
    return $self->{'_top'};
}
385
-
386
# Accessor: the left position stored in '_left'.
sub get_left_pos {
    my $self = shift;
    return $self->{'_left'};
}
391
-
392
# Accessor: the right position stored in '_right'.
sub get_right_pos {
    my $self = shift;
    return $self->{'_right'};
}
397
-
398
- # Support functions
399
# Return the value of attribute $attr of $node, or "" when the node has no
# such attribute.
#
# Parameters:
#   $node - object providing an att($name) method
#   $attr - attribute name
sub GetNodeAttr
{
    my ($node, $attr) = @_;

    # Test for definedness rather than truth, so that a legitimate value of
    # "0" (e.g. a position of 0) is returned as is instead of becoming "".
    my $value = $node->att($attr);
    return defined $value ? $value : "";
}
404
-
405
# Set attribute $attr of $node to $value by delegating to the node's set_att.
sub SetNodeAttr {
    my $node = shift;
    my ($attr, $value) = @_;
    return $node->set_att($attr, $value);
}
410
-
411
# Return whatever the node's text method yields.
sub GetNodeText {
    my $node = shift;
    return $node->text;
}
416
-
417
# Replace the node's text by delegating to its set_text method.
sub SetNodeText {
    my $node  = shift;
    my $value = shift;
    return $node->set_text($value);
}
422
-
423
- 1;
@@ -1,282 +0,0 @@
1
- package Omni::Omnipage;
2
-
3
- # Configuration
4
- use strict;
5
-
6
- # Local libraries
7
- use Omni::Config;
8
- use Omni::Omnidd;
9
- use Omni::Omnicol;
10
- use Omni::Omniframe;
11
-
12
- # Extern libraries
13
- use XML::Twig;
14
- use XML::Parser;
15
-
16
- # Global variables
17
# Lookup tables taken from Omni::Config: XML tag names, XML attribute names
# and Omni object names
my ($tag_list, $att_list, $obj_list) = (
    $Omni::Config::tag_list,
    $Omni::Config::att_list,
    $Omni::Config::obj_list,
);

# Scratch space: the XML::Twig handler (parse) fills these in and set_raw
# copies them into the object afterwards
my $tmp_content;
my @tmp_objs;
24
-
25
- ###
26
- # A page object in Omnipage xml: a page contains zero or many columns
27
- #
28
- # Do Hoang Nhat Huy, 09 Jan 2011
29
- ###
30
- # Initialization
31
# Constructor: returns an empty page object. '_raw' and '_content' stay
# undefined until set_raw is called.
sub new {
    my $class = shift;

    my $self = bless {
        '_self'    => $obj_list->{ 'OMNIPAGE' },
        '_raw'     => undef,
        '_content' => undef,
        # A page can have many columns, tables or images; each object gets
        # its own list
        '_objs'    => [],
    }, $class;

    return $self;
}
47
-
48
- #
49
# Store the raw XML of a page and parse it.
#
# Parameters:
#   $raw - XML string containing the page element
#
# The XML::Twig handler (parse) leaves its results in the module-level
# temporaries; they are copied into the object once parsing is finished.
sub set_raw
{
    my ($self, $raw) = @_;

    # Save the raw xml <page> ... </page>
    $self->{ '_raw' } = $raw;

    # Forget whatever the previous call left behind. Without this, input in
    # which the handler never fires would silently inherit the objects and
    # content of the page parsed before it.
    $tmp_content = undef;
    @tmp_objs    = ();

    # Only the page element is of interest, and it is handled by parse
    my $page_tag = $tag_list->{ 'PAGE' };
    my $twig = XML::Twig->new( twig_roots    => { $page_tag => 1 },
                               twig_handlers => { $page_tag => \&parse },
                               pretty_print  => 'indented' );

    # Start the XML parsing
    $twig->parse($raw);
    $twig->purge;

    # Copy all columns (into the existing list, so references handed out by
    # get_objs_ref stay valid)
    @{ $self->{ '_objs' } } = @tmp_objs;

    # Copy content
    $self->{ '_content' } = $tmp_content;
}
77
-
78
# Accessor: the raw XML string last given to set_raw.
sub get_raw {
    my $self = shift;
    return $self->{'_raw'};
}
83
-
84
# XML::Twig handler for a page element. Collects the column, dd and frame
# elements found in the page body, either directly below the body or one
# level down inside a section, into the module-level temporaries for
# set_raw to pick up.
#
# Parameters:
#   $twig - the XML::Twig object driving the parse (not used here)
#   $node - the page element being handled
sub parse
{
    my ($twig, $node) = @_;

    # At first, content is blank because there's no column, table or image
    $tmp_content = "";
    @tmp_objs    = ();

    # Get the body text; a page with no body has nothing to collect
    my $body = $node->first_child( $tag_list->{ 'BODY' } );
    return if ! defined $body;

    # The child of the body is usually <section> but it's not always the case
    my $section_tag = $tag_list->{ 'SECTION' };

    # next_sibling returns undef after the last child, which ends the loops
    for (my $child = $body->first_child(); defined $child; $child = $child->next_sibling())
    {
        if ($child->path() =~ m/\/$section_tag$/)
        {
            # Inside a section the interesting elements are its children.
            # Each one is handed over itself: the old code passed the whole
            # section for <dd>, and tested the section's path for <frame>,
            # so frames inside a section were never collected.
            for (my $grand_child = $child->first_child();
                 defined $grand_child;
                 $grand_child = $grand_child->next_sibling())
            {
                _add_page_object($grand_child);
            }
        }
        else
        {
            _add_page_object($child);
        }
    }

    return;
}

# If $element is a column, dd or frame, wrap it in the matching Omni object,
# append the object to the temporary list and append its text (followed by a
# newline) to the temporary page content. Any other element is ignored.
sub _add_page_object
{
    my ($element) = @_;

    my $column_tag = $tag_list->{ 'COL' };
    my $dd_tag     = $tag_list->{ 'DD' };
    my $frame_tag  = $tag_list->{ 'FRAME' };

    my $xpath = $element->path();
    my $object;

    if    ($xpath =~ m/\/$column_tag$/) { $object = Omni::Omnicol->new(); }
    elsif ($xpath =~ m/\/$dd_tag$/)     { $object = Omni::Omnidd->new(); }
    elsif ($xpath =~ m/\/$frame_tag$/)  { $object = Omni::Omniframe->new(); }
    else                                { return; }

    # Set raw content
    $object->set_raw($element->sprint());

    # Update object list and content
    push @tmp_objs, $object;
    $tmp_content = $tmp_content . $object->get_content() . "\n";

    return;
}
238
-
239
# Accessor: the object name recorded in '_self' by the constructor.
sub get_name {
    my $self = shift;
    return $self->{'_self'};
}
244
-
245
# Accessor: reference to the page's own list of child objects (not a copy).
sub get_objs_ref {
    my $self = shift;
    return $self->{'_objs'};
}
250
-
251
# Accessor: the text content stored in '_content'.
sub get_content {
    my $self = shift;
    return $self->{'_content'};
}
256
-
257
- # Support functions
258
# Return the value of attribute $attr of $node, or "" when the node has no
# such attribute.
#
# Parameters:
#   $node - object providing an att($name) method
#   $attr - attribute name
sub GetNodeAttr
{
    my ($node, $attr) = @_;

    # Test for definedness rather than truth, so that a legitimate value of
    # "0" is returned as is instead of becoming "".
    my $value = $node->att($attr);
    return defined $value ? $value : "";
}
263
-
264
# Set attribute $attr of $node to $value by delegating to the node's set_att.
sub SetNodeAttr {
    my $node = shift;
    my ($attr, $value) = @_;
    return $node->set_att($attr, $value);
}
269
-
270
# Return whatever the node's text method yields.
sub GetNodeText {
    my $node = shift;
    return $node->text;
}
275
-
276
# Replace the node's text by delegating to its set_text method.
sub SetNodeText {
    my $node  = shift;
    my $value = shift;
    return $node->set_text($value);
}
281
-
282
- 1;