biblicit 2.0.3 → 2.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,313 +0,0 @@
1
- package Omni::Traversal;
2
-
3
- # Configuration
4
- use strict;
5
-
6
- # Local libraries
7
- use Omni::Config;
8
-
9
- # Omnilib configuration: object name
10
- my $obj_list = $Omni::Config::obj_list;
11
-
12
- ###
13
- # 16 Feb 2010: Do Hoang Nhat Huy
14
- # Omni lib is used to handle the Omnipage XML in a generic
15
- # and 'elegant' way. The hierarchical structure of the classes
16
- # reflects the XML tree as follows
17
- # Omnidoc
18
- # |
19
- # Omnipage (obviously, one document can have many pages)
20
- # |__________________________________
21
- # || ||
22
- # Omnicol (columns in a page) Omnidd (image?)
23
- # | |
24
- # | Omnidd (nested)
25
- # |_________________________________|
26
- # | |
27
- # Omnipara (a paragraph) Omnitable (a table)
28
- # | |
29
- # | Omnicell (a cell)
30
- # | |
31
- # | Omnipara (a paragraph)
32
- # |_________________________________|
33
- # |
34
- # Omniline (a line)
35
- # |
36
- # Omnirun (a run, text of the same format)
37
- # |
38
- # Omniword (an individual word)
39
- #
40
- # This module provides a generic way to traverse the whole tree
41
- # to access each line of the document. Each line will have a
42
- # group of identifiers (indices) as follows
43
- # page id;
44
- # column id or dd id;
45
- # column id or dd id;*
46
- # table id;
47
- # cell id;
48
- # para id;
49
- # line id;
50
- # *: both dd and column can be nested inside each other
51
- ###
52
-
53
###
# Huydhn: collect lines whose addresses are selected
#
# Arguments
#   $doc        Omnidoc object; its get_objs_ref() gives the list of pages
#   $line_addrs reference to a list of line addresses. Each address is a
#               hash with the following members
#                   'L1' : page
#                   'L2' : column or dd
#                   'L3' : paragraph, table or frame
#                   'L4' : line in that paragraph, table or frame
#               The first and the last address bound the part of the tree
#               that is visited, and the addresses are matched one after
#               another, in list order, while walking forward.
#   $need_obj   optional. Undefined or 0: the text of each selected line is
#               collected. Anything else: the line objects are collected.
#
# Returns a reference to the list of collected lines. An undefined or empty
# $line_addrs gives an empty list.
#
# NOTE: the rows of a table or frame are plain strings obtained by splitting
# the content of the table. When $need_obj is set there is no object to
# return for such a row: nothing is collected for it, but its address is
# still consumed.
###
sub OmniCollector
{
    my ($doc, $line_addrs, $need_obj) = @_;

    # All the collected lines
    my @line_content = ();

    # Check the validity: nothing to collect without any address
    if ((! defined $line_addrs) || (scalar(@{ $line_addrs }) == 0)) { return (\@line_content); }

    # Text is collected unless the caller explicitly asks for objects
    my $want_text = ((! defined $need_obj) || ($need_obj == 0));

    # Index of the next address to look for and the number of addresses
    my $addr_index = 0;
    my $addr_count = scalar(@{ $line_addrs });

    # All pages in the document
    my $pages = $doc->get_objs_ref();

    # The walk goes from the first address to the last one
    my $first = $line_addrs->[ 0 ];
    my $final = $line_addrs->[ -1 ];

    # From page, To page
    my $start_page = $first->{ 'L1' };
    my $end_page = $final->{ 'L1' };

    # Tree traveling is 'not' fun. Seriously.
    # This is like a dungeon siege.
    # The outer loop is labeled so that the walk can be left from any depth
    # as soon as the last address has been served.
    PAGE:
    for (my $x = $start_page; $x <= $end_page; $x++)
    {
        # Are we on the page of the first / last address?
        my $on_first_l1 = ($x == $first->{ 'L1' });
        my $on_final_l1 = ($x == $final->{ 'L1' });

        # Column or dd
        my $level_2 = $pages->[ $x ]->get_objs_ref();
        my $start_l2 = $on_first_l1 ? $first->{ 'L2' } : 0;
        my $end_l2 = $on_final_l1 ? $final->{ 'L2' } : (scalar(@{ $level_2 }) - 1);

        for (my $y = $start_l2; $y <= $end_l2; $y++)
        {
            # Are we in the column of the first / last address?
            my $on_first_l2 = ($on_first_l1 && ($y == $first->{ 'L2' }));
            my $on_final_l2 = ($on_final_l1 && ($y == $final->{ 'L2' }));

            # Table or paragraph
            my $level_3 = $level_2->[ $y ]->get_objs_ref();
            my $start_l3 = $on_first_l2 ? $first->{ 'L3' } : 0;
            my $end_l3 = $on_final_l2 ? $final->{ 'L3' } : (scalar(@{ $level_3 }) - 1);

            for (my $z = $start_l3; $z <= $end_l3; $z++)
            {
                my $node = $level_3->[ $z ];
                my $name = $node->get_name();

                # Is a paragraph
                if ($name eq $obj_list->{ 'OMNIPARA' })
                {
                    # Lines of the paragraph
                    my $level_4 = $node->get_objs_ref();
                    my $start_l4 = ($on_first_l2 && ($z == $first->{ 'L3' })) ?
                                   $first->{ 'L4' } : 0;
                    my $end_l4 = ($on_final_l2 && ($z == $final->{ 'L3' })) ?
                                 $final->{ 'L4' } : (scalar(@{ $level_4 }) - 1);

                    for (my $t = $start_l4; $t <= $end_l4; $t++)
                    {
                        # Only keep selected line
                        next unless (($x == $line_addrs->[ $addr_index ]{ 'L1' }) &&
                                     ($y == $line_addrs->[ $addr_index ]{ 'L2' }) &&
                                     ($z == $line_addrs->[ $addr_index ]{ 'L3' }) &&
                                     ($t == $line_addrs->[ $addr_index ]{ 'L4' }));

                        if ($want_text)
                        {
                            push @line_content, $level_4->[ $t ]->get_content();
                        }
                        else
                        {
                            push @line_content, $level_4->[ $t ];
                        }

                        # Next selected line, stop after the last one
                        $addr_index++;
                        if ($addr_index == $addr_count) { last PAGE; }
                    }
                }
                # Is a table or frame
                elsif (($name eq $obj_list->{ 'OMNITABLE' }) || ($name eq $obj_list->{ 'OMNIFRAME' }))
                {
                    # TODO: this is actually a trick to get it working for now.
                    # We do not care about the cells inside the table but about
                    # the content of the table only. So the table is considered
                    # a paragraph in which the lines are its rows
                    my @rows = split(/\n/, $node->get_content());

                    for (my $t = 0; $t < scalar(@rows); $t++)
                    {
                        # Only keep selected line
                        next unless (($x == $line_addrs->[ $addr_index ]{ 'L1' }) &&
                                     ($y == $line_addrs->[ $addr_index ]{ 'L2' }) &&
                                     ($z == $line_addrs->[ $addr_index ]{ 'L3' }) &&
                                     ($t == $line_addrs->[ $addr_index ]{ 'L4' }));

                        # A row is a plain string: there is nothing to collect
                        # when objects are requested
                        if ($want_text) { push @line_content, $rows[ $t ]; }

                        # Next selected line, stop after the last one
                        $addr_index++;
                        if ($addr_index == $addr_count) { last PAGE; }
                    }
                }
            }
        }
    }

    return (\@line_content);
}
189
-
190
###
# Huydhn: travel the Omnidoc at line level
#
# Arguments
#   $doc    Omnidoc object; its get_objs_ref() gives the list of pages
#   $start  optional starting position. When undefined the walk starts at
#           the first page
#   $end    optional ending position (inclusive). When undefined the walk
#           runs through the last page
# Both positions are hashes with the following members
#   'L1' : page
#   'L2' : column or dd or frame
#   'L3' : table or paragraph or frame
#   'L4' : line in paragraph or table or frame
#
# Returns two references:
#   - the list of positions (hashes with 'L1' to 'L4') of the non-empty lines
#   - the list of contents of these lines, in the same order. The content is
#     stored as is, leading and trailing blanks are only ignored when
#     deciding whether the line is empty.
#
# NOTE: the rows of a table or frame are always visited in full, 'L4' of
# $start and $end only limits the lines of a paragraph.
###
sub OmniAirline
{
    my ($doc, $start, $end) = @_;

    # All the lines: where they are and what they contain
    my @line_pos = ();
    my @line_content = ();

    # Current position
    my %current = ();

    # All pages in the document
    my $pages = $doc->get_objs_ref();

    # From page, To page
    my $start_page = (defined $start) ? $start->{ 'L1' } : 0;
    my $end_page = (defined $end) ? $end->{ 'L1' } : (scalar(@{ $pages }) - 1);

    # Tree traveling is 'not' fun. Seriously.
    # This is like a dungeon siege.
    for (my $x = $start_page; $x <= $end_page; $x++)
    {
        # Current position
        $current{ 'L1' } = $x;

        # Are we on the page of the starting / ending position?
        my $on_start_l1 = ((defined $start) && ($x == $start->{ 'L1' }));
        my $on_end_l1 = ((defined $end) && ($x == $end->{ 'L1' }));

        # Column or dd
        my $level_2 = $pages->[ $x ]->get_objs_ref();
        my $start_l2 = $on_start_l1 ? $start->{ 'L2' } : 0;
        my $end_l2 = $on_end_l1 ? $end->{ 'L2' } : (scalar(@{ $level_2 }) - 1);

        for (my $y = $start_l2; $y <= $end_l2; $y++)
        {
            # Current position
            $current{ 'L2' } = $y;

            # Are we in the column of the starting / ending position?
            my $on_start_l2 = ($on_start_l1 && ($y == $start->{ 'L2' }));
            my $on_end_l2 = ($on_end_l1 && ($y == $end->{ 'L2' }));

            # Table or paragraph
            my $level_3 = $level_2->[ $y ]->get_objs_ref();
            my $start_l3 = $on_start_l2 ? $start->{ 'L3' } : 0;
            my $end_l3 = $on_end_l2 ? $end->{ 'L3' } : (scalar(@{ $level_3 }) - 1);

            for (my $z = $start_l3; $z <= $end_l3; $z++)
            {
                # Current position
                $current{ 'L3' } = $z;

                my $node = $level_3->[ $z ];
                my $name = $node->get_name();

                # Is a paragraph
                if ($name eq $obj_list->{ 'OMNIPARA' })
                {
                    # Lines of the paragraph
                    my $level_4 = $node->get_objs_ref();
                    my $start_l4 = ($on_start_l2 && ($z == $start->{ 'L3' })) ?
                                   $start->{ 'L4' } : 0;
                    my $end_l4 = ($on_end_l2 && ($z == $end->{ 'L3' })) ?
                                 $end->{ 'L4' } : (scalar(@{ $level_4 }) - 1);

                    for (my $t = $start_l4; $t <= $end_l4; $t++)
                    {
                        # Current position
                        $current{ 'L4' } = $t;

                        # Only keep non-empty line: the test is done on a
                        # trimmed copy, the content itself is left untouched
                        my $content = $level_4->[ $t ]->get_content();
                        my $trimmed = $content;
                        $trimmed =~ s/^\s+|\s+$//g;

                        if ($trimmed ne "")
                        {
                            # Save a copy of the current position and the
                            # content of the current line
                            push @line_pos, { %current };
                            push @line_content, $content;
                        }
                    }
                }
                # Is a table or frame
                elsif (($name eq $obj_list->{ 'OMNITABLE' }) || ($name eq $obj_list->{ 'OMNIFRAME' }))
                {
                    # TODO: this is actually a trick to get it working for now.
                    # We do not care about the cells inside the table but about
                    # the content of the table only. So the table is considered
                    # a paragraph in which the lines are its rows
                    my @rows = split(/\n/, $node->get_content());

                    # Strictly less than the number of rows: the last valid
                    # index is scalar(@rows) - 1
                    for (my $t = 0; $t < scalar(@rows); $t++)
                    {
                        # Current position
                        $current{ 'L4' } = $t;

                        # Only keep non-empty line
                        my $trimmed = $rows[ $t ];
                        $trimmed =~ s/^\s+|\s+$//g;

                        if ($trimmed ne "")
                        {
                            # Save a copy of the current position and the
                            # content of the current line
                            push @line_pos, { %current };
                            push @line_content, $rows[ $t ];
                        }
                    }
                }
            }
        }
    }

    return (\@line_pos, \@line_content);
}
312
-
313
- 1;