biblicit 2.0.3 → 2.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,423 +0,0 @@
1
- package Omni::Omniline;
2
-
3
- # Configuration
4
- use strict;
5
-
6
- # Local libraries
7
- use Omni::Config;
8
- use Omni::Omniword;
9
- use Omni::Omnirun;
10
-
11
- # Extern libraries
12
- use XML::Twig;
13
- use XML::Parser;
14
- use XML::Writer;
15
- use XML::Writer::String;
16
-
17
- # Global variables
18
- my $tag_list = $Omni::Config::tag_list;
19
- my $att_list = $Omni::Config::att_list;
20
- my $obj_list = $Omni::Config::obj_list;
21
-
22
- # Temporary variables
23
- my $tmp_content = undef;
24
- my $tmp_baseline = undef;
25
- my $tmp_bottom = undef;
26
- my $tmp_top = undef;
27
- my $tmp_left = undef;
28
- my $tmp_right = undef;
29
- my @tmp_objs = ();
30
-
31
- ###
32
- # A line object in Omnipage xml: a line can contain one or many runs
33
- #
34
- # Do Hoang Nhat Huy, 09 Jan 2011
35
- ###
36
- # Initialization
37
# Constructor for a line object.
#
# Every geometric field, the raw XML, the text content and the bullet flag
# start out undefined; '_objs' refers to a fresh, empty array that will hold
# the runs (or words) found in the line. '_self' is the object's type name
# taken from the OMNILINE entry of the configured object list.
sub new
{
    my $class = shift;

    my %members = (
        '_self'     => $obj_list->{ 'OMNILINE' },
        '_raw'      => undef,
        '_content'  => undef,
        '_baseline' => undef,
        '_bottom'   => undef,
        '_top'      => undef,
        '_left'     => undef,
        '_right'    => undef,
        '_bullet'   => undef,
        '_objs'     => [],
    );

    return bless \%members, $class;
}
59
-
60
- #
61
# Store the raw <ln> ... </ln> XML of a line and parse it into this object.
#
# Parameters:
#   $self - the line object to fill
#   $raw  - the XML string, handed unchanged to XML::Twig
#
# The twig handler (parse) hands its results back through the file-scoped
# $tmp_* variables, which are shared by every object of this package. They
# are cleared before parsing so that input which never triggers the handler
# leaves this object empty instead of inheriting the previous line's data.
sub set_raw
{
    my ($self, $raw) = @_;

    # Save the raw xml <ln> ... </ln>
    $self->{ '_raw' } = $raw;

    # Forget whatever an earlier parse left behind
    $tmp_content  = undef;
    $tmp_baseline = undef;
    $tmp_bottom   = undef;
    $tmp_top      = undef;
    $tmp_left     = undef;
    $tmp_right    = undef;
    @tmp_objs     = ();

    # Only the line tag is of interest, both as twig root and as handler
    my $line_tag = $tag_list->{ 'LINE' };

    my $twig = XML::Twig->new( twig_roots    => { $line_tag => 1 },
                               twig_handlers => { $line_tag => \&parse },
                               pretty_print  => 'indented' );

    # Start the XML parsing
    $twig->parse($raw);
    $twig->purge;

    # Copy information from temporary variables to class members
    $self->{ '_baseline' } = $tmp_baseline;
    $self->{ '_bottom' }   = $tmp_bottom;
    $self->{ '_top' }      = $tmp_top;
    $self->{ '_left' }     = $tmp_left;
    $self->{ '_right' }    = $tmp_right;

    # Copy all objects
    @{ $self->{ '_objs' } } = @tmp_objs;

    # Copy content
    $self->{ '_content' } = $tmp_content;
}
94
-
95
# Accessor: the raw XML string last given to set_raw (undef if never set).
sub get_raw
{
    my $self = shift;
    return $self->{ '_raw' };
}
100
-
101
# Collect (attribute name, attribute value) pairs of $node for the given
# $att_list keys, in the order given, ready to be passed to startTag.
sub _attr_pairs
{
    my ($node, @keys) = @_;

    my @pairs = ();
    foreach my $key (@keys)
    {
        my $name = $att_list->{ $key };
        push @pairs, $name, GetNodeAttr($node, $name);
    }

    return @pairs;
}

# Build a synthetic <run> and return the Omni::Omnirun parsed from it.
#   $format_node - element whose font/format attributes are copied to <run>
#   $word_node   - element whose position is copied to an inner <wd>,
#                  or undef when no <wd> wrapper is wanted
#   $inner_xml   - raw XML written inside the innermost generated tag
sub _make_fake_run
{
    my ($format_node, $word_node, $inner_xml) = @_;

    my $output = XML::Writer::String->new();
    my $writer = XML::Writer->new(OUTPUT => $output, UNSAFE => 'true');

    # Form the fake <run>
    $writer->startTag( "run",
                       _attr_pairs($format_node,
                                   qw(FONTFACE FONTFAMILY FONTPITCH FONTSIZE SPACING
                                      SUSCRIPT UNDERLINE BOLD ITALIC)) );

    # Form the fake <wd>, if requested
    if (defined $word_node)
    {
        $writer->startTag( "wd", _attr_pairs($word_node, qw(BOTTOM TOP LEFT RIGHT)) );
    }

    $writer->raw($inner_xml);

    if (defined $word_node)
    {
        $writer->endTag("wd");
    }
    $writer->endTag("run");
    $writer->end();

    my $run = Omni::Omnirun->new();
    $run->set_raw($output->value());

    return $run;
}

# Record one run: add it to the object list and append its text to the
# content accumulated for the line.
sub _append_run
{
    my ($run) = @_;

    push @tmp_objs, $run;
    $tmp_content = $tmp_content . $run->get_content();

    return;
}

# XML::Twig handler for the line tag.
#
# Parameters:
#   $twig - the XML::Twig object (unused)
#   $node - the line element
#
# Fills the file-scoped $tmp_* variables, which set_raw copies into the
# object afterwards:
#   * position attributes of the line,
#   * one Omni::Omnirun per run found (real or synthesised),
#   * the concatenated text content of the line.
sub parse
{
    my ($twig, $node) = @_;

    # At first, content is blank
    $tmp_content = "";
    # because there's no run
    @tmp_objs = ();

    # Get <line> node attributes
    $tmp_bottom   = GetNodeAttr($node, $att_list->{ 'BOTTOM' });
    $tmp_top      = GetNodeAttr($node, $att_list->{ 'TOP' });
    $tmp_left     = GetNodeAttr($node, $att_list->{ 'LEFT' });
    $tmp_right    = GetNodeAttr($node, $att_list->{ 'RIGHT' });
    $tmp_baseline = GetNodeAttr($node, $att_list->{ 'BASELINE' });

    # Some type of child
    my $space_tag   = $tag_list->{ 'SPACE' };
    my $tab_tag     = $tag_list->{ 'TAB' };
    my $newline_tag = $tag_list->{ 'NEWLINE' };
    my $word_tag    = $tag_list->{ 'WORD' };
    my $run_tag     = $tag_list->{ 'RUN' };

    # Check if there's any run
    my @all_runs = $node->descendants($run_tag);

    if (scalar(@all_runs) == 0)
    {
        # No run anywhere: the line carries the format itself, so wrap its
        # whole inner content in one fake <run> built from the line attributes
        _append_run( _make_fake_run($node, undef, $node->xml_string()) );
    }
    else
    {
        # A line can contain both <run> and <wd>
        foreach my $child ($node->children())
        {
            my $xpath = $child->path();

            # if this child is <run>
            if ($xpath =~ m/\/$run_tag$/)
            {
                my $run = Omni::Omnirun->new();
                $run->set_raw($child->sprint());
                _append_run($run);
            }
            # if this child is <wd>
            elsif ($xpath =~ m/\/$word_tag$/)
            {
                # One word can contain many <run>
                my @word_runs = $child->children($run_tag);

                if (scalar(@word_runs) > 0)
                {
                    # NOTE: a word holding several runs has parts in different
                    # formats. Each run becomes its own fake <run>, and each one
                    # wraps a fake <wd> carrying the position of the whole word.
                    foreach my $word_run (@word_runs)
                    {
                        _append_run( _make_fake_run($word_run, $child, $word_run->xml_string()) );
                    }
                }
                # Special case: <wd> contains no <run> but stores the format itself
                else
                {
                    _append_run( _make_fake_run($child, $child, $child->xml_string()) );
                }
            }
            elsif ($xpath =~ m/\/$space_tag$/)
            {
                $tmp_content = $tmp_content . " ";
            }
            elsif ($xpath =~ m/\/$tab_tag$/)
            {
                $tmp_content = $tmp_content . "\t";
            }
            elsif ($xpath =~ m/\/$newline_tag$/)
            {
                $tmp_content = $tmp_content . "\n";
            }
        }
    }

    return;
}
337
-
338
# Accessor: the bullet marker stored by set_bullet (undef if never set).
sub get_bullet
{
    my $self = shift;
    return $self->{ '_bullet' };
}
343
-
344
# Mutator: remember $bullet as the bullet marker of this line.
# The assigned value is also the sub's return value.
sub set_bullet
{
    my $self   = shift;
    my $bullet = shift;

    return $self->{ '_bullet' } = $bullet;
}
349
-
350
# Accessor: the object type name stored under '_self' by the constructor.
sub get_name
{
    my $self = shift;
    return $self->{ '_self' };
}
355
-
356
# Accessor: reference to the array of objects held by this line.
# The reference itself is returned, not a copy.
sub get_objs_ref
{
    my $self = shift;
    return $self->{ '_objs' };
}
361
-
362
# Accessor: the text content stored under '_content'.
sub get_content
{
    my $self = shift;
    return $self->{ '_content' };
}
367
-
368
# Accessor: the baseline attribute value stored under '_baseline'.
sub get_baseline
{
    my $self = shift;
    return $self->{ '_baseline' };
}
373
-
374
# Accessor: the bottom attribute value stored under '_bottom'.
sub get_bottom_pos
{
    my $self = shift;
    return $self->{ '_bottom' };
}
379
-
380
# Accessor: the top attribute value stored under '_top'.
sub get_top_pos
{
    my $self = shift;
    return $self->{ '_top' };
}
385
-
386
# Accessor: the left attribute value stored under '_left'.
sub get_left_pos
{
    my $self = shift;
    return $self->{ '_left' };
}
391
-
392
# Accessor: the right attribute value stored under '_right'.
sub get_right_pos
{
    my $self = shift;
    return $self->{ '_right' };
}
397
-
398
- # Support functions
399
# Read attribute $attr of element $node (anything offering an att() method).
#
# Returns the attribute value, or "" when the element has no such attribute.
# The test is on definedness rather than truth, so a legitimate value of "0"
# is handed back as "0" instead of being mistaken for a missing attribute.
sub GetNodeAttr
{
    my ($node, $attr) = @_;

    my $value = $node->att($attr);

    return defined $value ? $value : "";
}
404
-
405
# Set attribute $attr of element $node to $value by delegating to the
# element's set_att() method; whatever set_att() returns is passed through.
sub SetNodeAttr
{
    my $node  = shift;
    my $attr  = shift;
    my $value = shift;

    return $node->set_att($attr, $value);
}
410
-
411
# Return the text of element $node, as reported by its text() method.
sub GetNodeText
{
    my $node = shift;
    return $node->text;
}
416
-
417
# Replace the text of element $node with $value by delegating to the
# element's set_text() method; whatever set_text() returns is passed through.
sub SetNodeText
{
    my $node  = shift;
    my $value = shift;

    return $node->set_text($value);
}
422
-
423
- 1;
@@ -1,282 +0,0 @@
1
- package Omni::Omnipage;
2
-
3
- # Configuration
4
- use strict;
5
-
6
- # Local libraries
7
- use Omni::Config;
8
- use Omni::Omnidd;
9
- use Omni::Omnicol;
10
- use Omni::Omniframe;
11
-
12
- # Extern libraries
13
- use XML::Twig;
14
- use XML::Parser;
15
-
16
- # Global variables
17
- my $tag_list = $Omni::Config::tag_list;
18
- my $att_list = $Omni::Config::att_list;
19
- my $obj_list = $Omni::Config::obj_list;
20
-
21
- # Temporary variables
22
- my $tmp_content = undef;
23
- my @tmp_objs = ();
24
-
25
- ###
26
- # A page object in Omnipage xml: a page contains zero or many collums
27
- #
28
- # Do Hoang Nhat Huy, 09 Jan 2011
29
- ###
30
- # Initialization
31
# Constructor for a page object.
#
# The raw XML and the text content start out undefined; '_objs' refers to a
# fresh, empty array that will hold the columns, dd blocks and frames of the
# page. '_self' is the object's type name taken from the OMNIPAGE entry of
# the configured object list.
sub new
{
    my $class = shift;

    my %members = (
        '_self'    => $obj_list->{ 'OMNIPAGE' },
        '_raw'     => undef,
        '_content' => undef,
        '_objs'    => [],
    );

    return bless \%members, $class;
}
47
-
48
- #
49
# Store the raw <page> ... </page> XML of a page and parse it into this object.
#
# Parameters:
#   $self - the page object to fill
#   $raw  - the XML string, handed unchanged to XML::Twig
#
# The twig handler (parse) hands its results back through the file-scoped
# $tmp_content and @tmp_objs, which are shared by every object of this
# package. They are cleared before parsing so that input which never
# triggers the handler leaves this object empty instead of inheriting the
# previous page's data.
sub set_raw
{
    my ($self, $raw) = @_;

    # Save the raw xml <page> ... </page>
    $self->{ '_raw' } = $raw;

    # Forget whatever an earlier parse left behind
    $tmp_content = undef;
    @tmp_objs    = ();

    # Only the page tag is of interest, both as twig root and as handler
    my $page_tag = $tag_list->{ 'PAGE' };

    my $twig = XML::Twig->new( twig_roots    => { $page_tag => 1 },
                               twig_handlers => { $page_tag => \&parse },
                               pretty_print  => 'indented' );

    # Start the XML parsing
    $twig->parse($raw);
    $twig->purge;

    # Copy all columns, dd blocks and frames
    @{ $self->{ '_objs' } } = @tmp_objs;

    # Copy content
    $self->{ '_content' } = $tmp_content;
}
77
-
78
# Accessor: the raw XML string last given to set_raw (undef if never set).
sub get_raw
{
    my $self = shift;
    return $self->{ '_raw' };
}
83
-
84
# If $node is a column, dd or frame element, wrap it in the matching Omni
# object, record that object in @tmp_objs and append its text (plus a
# newline) to $tmp_content. Any other element is ignored.
sub _collect_page_object
{
    my ($node) = @_;

    my $dd_tag     = $tag_list->{ 'DD' };
    my $column_tag = $tag_list->{ 'COL' };
    my $frame_tag  = $tag_list->{ 'FRAME' };

    my $xpath = $node->path();
    my $obj   = undef;

    if ($xpath =~ m/\/$column_tag$/)
    {
        $obj = Omni::Omnicol->new();
    }
    elsif ($xpath =~ m/\/$dd_tag$/)
    {
        $obj = Omni::Omnidd->new();
    }
    elsif ($xpath =~ m/\/$frame_tag$/)
    {
        $obj = Omni::Omniframe->new();
    }

    # Not an element this page keeps track of
    if (! defined $obj) { return; }

    # Set raw content from the element itself
    $obj->set_raw($node->sprint());

    # Update object list
    push @tmp_objs, $obj;

    # Update content
    $tmp_content = $tmp_content . $obj->get_content() . "\n";

    return;
}

# XML::Twig handler for the page tag.
#
# Parameters:
#   $twig - the XML::Twig object (unused)
#   $node - the page element
#
# Fills the file-scoped $tmp_content and @tmp_objs, which set_raw copies into
# the object afterwards. Columns, dd blocks and frames are collected both
# when they sit directly under the body and when they sit one level deeper
# inside a section. A page without a body yields empty content and no objects.
sub parse
{
    my ($twig, $node) = @_;

    # At first, content is blank
    $tmp_content = "";
    # because there's no column, table or image
    @tmp_objs = ();

    # Get the body text
    my $body = $node->first_child( $tag_list->{ 'BODY' } );
    # Page with no body, return
    if (! defined $body) { return; }

    # The child of the body is usually <section> but it's not always the case
    my $section_tag = $tag_list->{ 'SECTION' };

    foreach my $child ($body->children())
    {
        if ($child->path() =~ m/\/$section_tag$/)
        {
            # Inside a section the interesting elements are the section's
            # own children, so each one is examined (and serialised) itself
            foreach my $grand_child ($child->children())
            {
                _collect_page_object($grand_child);
            }
        }
        else
        {
            _collect_page_object($child);
        }
    }

    return;
}
238
-
239
# Accessor: the object type name stored under '_self' by the constructor.
sub get_name
{
    my $self = shift;
    return $self->{ '_self' };
}
244
-
245
# Accessor: reference to the array of objects held by this page.
# The reference itself is returned, not a copy.
sub get_objs_ref
{
    my $self = shift;
    return $self->{ '_objs' };
}
250
-
251
# Accessor: the text content stored under '_content'.
sub get_content
{
    my $self = shift;
    return $self->{ '_content' };
}
256
-
257
- # Support functions
258
# Read attribute $attr of element $node (anything offering an att() method).
#
# Returns the attribute value, or "" when the element has no such attribute.
# The test is on definedness rather than truth, so a legitimate value of "0"
# is handed back as "0" instead of being mistaken for a missing attribute.
sub GetNodeAttr
{
    my ($node, $attr) = @_;

    my $value = $node->att($attr);

    return defined $value ? $value : "";
}
263
-
264
# Set attribute $attr of element $node to $value by delegating to the
# element's set_att() method; whatever set_att() returns is passed through.
sub SetNodeAttr
{
    my $node  = shift;
    my $attr  = shift;
    my $value = shift;

    return $node->set_att($attr, $value);
}
269
-
270
# Return the text of element $node, as reported by its text() method.
sub GetNodeText
{
    my $node = shift;
    return $node->text;
}
275
-
276
# Replace the text of element $node with $value by delegating to the
# element's set_text() method; whatever set_text() returns is passed through.
sub SetNodeText
{
    my $node  = shift;
    my $value = shift;

    return $node->set_text($value);
}
281
-
282
- 1;