biblicit 2.0.3 → 2.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,313 +0,0 @@
1
- package Omni::Traversal;
2
-
3
- # Configuration
4
- use strict;
5
-
6
- # Local libraries
7
- use Omni::Config;
8
-
9
- # Omnilib configuration: object name
10
- my $obj_list = $Omni::Config::obj_list;
11
-
12
- ###
13
- # 16 Feb 2010: Do Hoang Nhat Huy
14
- # Omni lib is used to handle the Omnipage XML in a generic
15
- # and 'elegant' way. The hierarchical structure of the classes
16
- # reflects the XML tree as follows
17
- # Omnidoc
18
- # |
19
- # Omnipage (obviously, one document can have many pages)
20
- # |__________________________________
21
- # || ||
22
- # Omnicol (columns in a page) Omnidd (image?)
23
- # | |
24
- # | Omnidd (nested)
25
- # |_________________________________|
26
- # | |
27
- # Omnipara (a paragraph) Omnitable (a table)
28
- # | |
29
- # | Omnicell (a cell)
30
- # | |
31
- # | Omnipara (a paragraph)
32
- # |_________________________________|
33
- # |
34
- # Omniline (a line)
35
- # |
36
- # Omnirun (a run, text of the same format)
37
- # |
38
- # Omniword (an individual word)
39
- #
40
- # This module provides a generic way to traverse the whole tree
41
- # to access each line of the document. Each line will have a
42
- # group of identifiers (index) as follows
43
- # page id;
44
- # column id or dd id;
45
- # column id or dd id;*
46
- # table id;
47
- # cell id;
48
- # para id;
49
- # line id;
50
- # *: both dd and column can be nested inside each other
51
- ###
52
-
53
- ###
54
- # Huydhn: collect lines whose addresses are selected
55
- ###
56
sub OmniCollector
{
	# Collect the lines of a document whose addresses are listed in $line_addrs.
	#
	# Parameters
	#	$doc        : document object; get_objs_ref() gives its pages
	#	$line_addrs : reference to an array of address hashes with members
	#	                'L1' : page
	#	                'L2' : column or dd or frame
	#	                'L3' : table or paragraph or frame
	#	                'L4' : line in paragraph or table or frame
	#	              The walk runs from the first address to the last one and
	#	              the addresses are matched one after another, so they
	#	              presumably have to be in document order (confirm with callers)
	#	$need_obj   : when defined and non-zero, the selected paragraph lines are
	#	              returned as line objects instead of their get_content() text
	#
	# Returns a reference to the array of collected lines. Rows of a table or
	# frame are plain text without an object behind them: they are collected as
	# text when no object is requested and contribute nothing otherwise.
	my ($doc, $line_addrs, $need_obj) = @_;

	# All the lines
	my @line_content = ();

	# Check the validity: no address list, or an empty one, means nothing to collect
	if ((! defined $line_addrs) || (scalar(@{ $line_addrs }) == 0)) { return (\@line_content); }

	# Text or object?
	my $want_obj = ((defined $need_obj) && ($need_obj != 0)) ? 1 : 0;

	# The first and the last addresses bound the walk
	my $first = $line_addrs->[ 0 ];
	my $last  = $line_addrs->[ -1 ];

	# Index of the next address to look for, and how many there are
	my $addr_index = 0;
	my $addr_total = scalar(@{ $line_addrs });

	# All pages in the document
	my $pages = $doc->get_objs_ref();

	# From page, To page
	my $start_page = $first->{ 'L1' };
	my $end_page   = $last->{ 'L1' };

	# The label lets the innermost loops stop the whole walk as soon as the
	# last address has been served
	PAGE:
	for (my $x = $start_page; $x <= $end_page; $x++)
	{
		my $on_first_l1 = ($x == $first->{ 'L1' });
		my $on_last_l1  = ($x == $last->{ 'L1' });

		# Column or dd
		my $level_2  = $pages->[ $x ]->get_objs_ref();
		my $start_l2 = $on_first_l1 ? $first->{ 'L2' } : 0;
		my $end_l2   = $on_last_l1  ? $last->{ 'L2' }  : (scalar(@{ $level_2 }) - 1);

		for (my $y = $start_l2; $y <= $end_l2; $y++)
		{
			my $on_first_l2 = ($on_first_l1 && ($y == $first->{ 'L2' }));
			my $on_last_l2  = ($on_last_l1  && ($y == $last->{ 'L2' }));

			# Table or paragraph
			my $level_3  = $level_2->[ $y ]->get_objs_ref();
			my $start_l3 = $on_first_l2 ? $first->{ 'L3' } : 0;
			my $end_l3   = $on_last_l2  ? $last->{ 'L3' }  : (scalar(@{ $level_3 }) - 1);

			for (my $z = $start_l3; $z <= $end_l3; $z++)
			{
				my $node = $level_3->[ $z ];
				my $name = $node->get_name();

				# Is a paragraph
				if ($name eq $obj_list->{ 'OMNIPARA' })
				{
					# Lines
					my $level_4  = $node->get_objs_ref();
					my $start_l4 = ($on_first_l2 && ($z == $first->{ 'L3' })) ? $first->{ 'L4' } : 0;
					my $end_l4   = ($on_last_l2  && ($z == $last->{ 'L3' }))  ? $last->{ 'L4' }  : (scalar(@{ $level_4 }) - 1);

					for (my $t = $start_l4; $t <= $end_l4; $t++)
					{
						# Only keep selected line
						my $wanted = $line_addrs->[ $addr_index ];
						if (($x != $wanted->{ 'L1' }) ||
							($y != $wanted->{ 'L2' }) ||
							($z != $wanted->{ 'L3' }) ||
							($t != $wanted->{ 'L4' })) { next; }

						if ($want_obj)
						{
							push @line_content, $level_4->[ $t ];
						}
						else
						{
							push @line_content, $level_4->[ $t ]->get_content();
						}

						# Next selected line, stop everything after the last one
						$addr_index++;
						if ($addr_index == $addr_total) { last PAGE; }
					}
				}
				# Is a table or frame
				elsif (($name eq $obj_list->{ 'OMNITABLE' }) || ($name eq $obj_list->{ 'OMNIFRAME' }))
				{
					# TODO: this is actually a trick to get it working for now.
					# We do not care about the cells inside the table but about
					# its content only. So the table is considered a paragraph
					# whose lines are its rows
					my @rows = split(/\n/, $node->get_content());

					for (my $t = 0; $t < scalar(@rows); $t++)
					{
						# Only keep selected line
						my $wanted = $line_addrs->[ $addr_index ];
						if (($x != $wanted->{ 'L1' }) ||
							($y != $wanted->{ 'L2' }) ||
							($z != $wanted->{ 'L3' }) ||
							($t != $wanted->{ 'L4' })) { next; }

						# A row is only text, there is no object to hand back
						if (! $want_obj) { push @line_content, $rows[ $t ]; }

						# Next selected line, stop everything after the last one
						$addr_index++;
						if ($addr_index == $addr_total) { last PAGE; }
					}
				}
			}
		}
	}

	return (\@line_content);
}
189
-
190
- ###
191
- # Huydhn: travel the Omnidoc at line level
192
- ###
193
sub OmniAirline
{
	# Walk the document at line level and list every non-empty line together
	# with its position.
	#
	# Parameters
	#	$doc   : Omnidoc object; get_objs_ref() gives its pages
	#	$start : starting position, the walk begins at the first page when undefined
	#	$end   : ending position, the walk runs to the last page when undefined
	# Both positions are hashes with the following members
	#	'L1' : page
	#	'L2' : column or dd or frame
	#	'L3' : table or paragraph or frame
	#	'L4' : line in paragraph or table or frame
	#
	# Returns two array references: the positions (hashes with the members
	# above) of the non-empty lines and, at the same indices, their content.
	# A line is empty when nothing is left after removing the leading and the
	# trailing whitespace; the content itself is returned untrimmed.
	my ($doc, $start, $end) = @_;

	# All the lines
	my @line_pos     = ();
	my @line_content = ();

	# All pages in the document
	my $pages = $doc->get_objs_ref();

	# From page, To page
	my $start_page = (defined $start) ? $start->{ 'L1' } : 0;
	my $end_page   = (defined $end)   ? $end->{ 'L1' }   : (scalar(@{ $pages }) - 1);

	for (my $x = $start_page; $x <= $end_page; $x++)
	{
		# Are we on the page of the starting / ending position?
		my $on_start_l1 = ((defined $start) && ($x == $start->{ 'L1' }));
		my $on_end_l1   = ((defined $end)   && ($x == $end->{ 'L1' }));

		# Column or dd
		my $level_2  = $pages->[ $x ]->get_objs_ref();
		my $start_l2 = $on_start_l1 ? $start->{ 'L2' } : 0;
		my $end_l2   = $on_end_l1   ? $end->{ 'L2' }   : (scalar(@{ $level_2 }) - 1);

		for (my $y = $start_l2; $y <= $end_l2; $y++)
		{
			my $on_start_l2 = ($on_start_l1 && ($y == $start->{ 'L2' }));
			my $on_end_l2   = ($on_end_l1   && ($y == $end->{ 'L2' }));

			# Table or paragraph
			my $level_3  = $level_2->[ $y ]->get_objs_ref();
			my $start_l3 = $on_start_l2 ? $start->{ 'L3' } : 0;
			my $end_l3   = $on_end_l2   ? $end->{ 'L3' }   : (scalar(@{ $level_3 }) - 1);

			for (my $z = $start_l3; $z <= $end_l3; $z++)
			{
				my $node = $level_3->[ $z ];
				my $name = $node->get_name();

				# Is a paragraph
				if ($name eq $obj_list->{ 'OMNIPARA' })
				{
					# Lines
					my $level_4  = $node->get_objs_ref();
					my $start_l4 = ($on_start_l2 && ($z == $start->{ 'L3' })) ? $start->{ 'L4' } : 0;
					my $end_l4   = ($on_end_l2   && ($z == $end->{ 'L3' }))   ? $end->{ 'L4' }   : (scalar(@{ $level_4 }) - 1);

					for (my $t = $start_l4; $t <= $end_l4; $t++)
					{
						# Only keep non-empty line
						my $text = $level_4->[ $t ]->get_content();
						(my $trimmed = $text) =~ s/^\s+|\s+$//g;

						if ($trimmed ne "")
						{
							# Save the current position and the content of the current line
							push @line_pos, { 'L1' => $x, 'L2' => $y, 'L3' => $z, 'L4' => $t };
							push @line_content, $text;
						}
					}
				}
				# Is a table or frame
				elsif (($name eq $obj_list->{ 'OMNITABLE' }) || ($name eq $obj_list->{ 'OMNIFRAME' }))
				{
					# TODO: this is actually a trick to get it working for now.
					# We do not care about the cells inside the table but about
					# its content only. So the table is considered a paragraph
					# whose lines are its rows
					#
					# NOTE(review): unlike paragraph lines, the rows are not
					# limited by the 'L4' member of $start and $end - confirm
					# whether this is intended
					my @rows = split(/\n/, $node->get_content());

					# Strictly below the number of rows: there is nothing to read past the last one
					for (my $t = 0; $t < scalar(@rows); $t++)
					{
						# Only keep non-empty line
						(my $trimmed = $rows[ $t ]) =~ s/^\s+|\s+$//g;

						if ($trimmed ne "")
						{
							# Save the current position and the content of the current line
							push @line_pos, { 'L1' => $x, 'L2' => $y, 'L3' => $z, 'L4' => $t };
							push @line_content, $rows[ $t ];
						}
					}
				}
			}
		}
	}

	return (\@line_pos, \@line_content);
}
312
-
313
- 1;