biblicit 2.0.3 → 2.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,328 +0,0 @@
1
- package Omni::Omnidd;
2
-
3
- # Configuration
4
- use strict;
5
-
6
- # Local libraries
7
- use Omni::Config;
8
- use Omni::Omnicol;
9
- use Omni::Omnipara;
10
- use Omni::Omniframe;
11
- use Omni::Omnitable;
12
-
13
- # Extern libraries
14
- use XML::Twig;
15
- use XML::Parser;
16
-
17
- # Global variables
18
- my $tag_list = $Omni::Config::tag_list;
19
- my $att_list = $Omni::Config::att_list;
20
- my $obj_list = $Omni::Config::obj_list;
21
-
22
- # Temporary variables
23
-
24
- ###
25
- # A dd object in Omnipage xml: a dd, don't know what it is, but its structure
26
- # is quite similar to a column
27
- #
28
- # Do Hoang Nhat Huy, 11 Jan 2011
29
- ###
30
- # Initialization
31
- sub new
32
- {
33
- my ($class) = @_;
34
-
35
- # dd: a dd can have many tables, or pictures, may be paras, and columns
36
- my @objs = ();
37
-
38
- # Class members
39
- my $self = { '_self' => $obj_list->{ 'OMNIDD' },
40
- '_raw' => undef,
41
- '_content' => undef,
42
- '_bottom' => undef,
43
- '_top' => undef,
44
- '_left' => undef,
45
- '_right' => undef,
46
- '_bottom_dist' => undef,
47
- '_top_dist' => undef,
48
- '_left_dist' => undef,
49
- '_right_dist' => undef,
50
- '_objs' => \@objs };
51
-
52
- bless $self, $class;
53
- return $self;
54
- }
55
-
56
- #
57
- sub set_raw
58
- {
59
- my ($self, $raw) = @_;
60
-
61
- # Save the raw xml <dd> ... </dd>
62
- $self->{ '_raw' } = $raw;
63
-
64
- # Parse the raw string
65
- my $twig_roots = { $tag_list->{ 'DD' } => 1 };
66
- my $twig_handlers = { $tag_list->{ 'DD' } => sub { parse(@_, \$self); } };
67
-
68
- # XML::Twig
69
- my $twig = new XML::Twig( twig_roots => $twig_roots,
70
- twig_handlers => $twig_handlers,
71
- pretty_print => 'indented' );
72
-
73
- # Start the XML parsing
74
- $twig->parse($raw, \$self);
75
- $twig->purge;
76
- }
77
-
78
- sub get_raw
79
- {
80
- my ($self) = @_;
81
- return $self->{ '_raw' };
82
- }
83
-
84
- sub parse
85
- {
86
- my ($twig, $node, $self) = @_;
87
-
88
- # At first, content is blank
89
- my $tmp_content = "";
90
- # because there's no object
91
- my @tmp_objs = ();
92
-
93
- # Get <dd> node attributes
94
- my $tmp_bottom = GetNodeAttr($node, $att_list->{ 'BOTTOM' });
95
- my $tmp_top = GetNodeAttr($node, $att_list->{ 'TOP' });
96
- my $tmp_left = GetNodeAttr($node, $att_list->{ 'LEFT' });
97
- my $tmp_right = GetNodeAttr($node, $att_list->{ 'RIGHT' });
98
- my $tmp_bottom_dist = GetNodeAttr($node, $att_list->{ 'BOTTOMDIST' });
99
- my $tmp_top_dist = GetNodeAttr($node, $att_list->{ 'TOPDIST' });
100
- my $tmp_left_dist = GetNodeAttr($node, $att_list->{ 'LEFTDIST' });
101
- my $tmp_right_dist = GetNodeAttr($node, $att_list->{ 'RIGHTDIST' });
102
-
103
- # Check if there's any paragraph, col, table, or picture
104
- # The large number of possible children is due to the
105
- # ambiguous structure of the Omnipage XML
106
- my $dd_tag = $tag_list->{ 'DD' };
107
- my $img_tag = $tag_list->{ 'PICTURE' };
108
- my $para_tag = $tag_list->{ 'PARA' };
109
- my $table_tag = $tag_list->{ 'TABLE' };
110
- my $column_tag = $tag_list->{ 'COLUMN' };
111
- my $frame_tag = $tag_list->{ 'FRAME' };
112
-
113
- my $child = undef;
114
- # Get the first child in the body text
115
- $child = $node->first_child();
116
-
117
- while (defined $child)
118
- {
119
- my $xpath = $child->path();
120
-
121
- # if this child is a <para> tag
122
- if ($xpath =~ m/\/$para_tag$/)
123
- {
124
- my $para = new Omni::Omnipara();
125
-
126
- # Set raw content
127
- $para->set_raw($child->sprint());
128
-
129
- # Update paragraph list
130
- push @tmp_objs, $para;
131
-
132
- # Update content
133
- $tmp_content = $tmp_content . $para->get_content() . "\n";
134
- }
135
- # if this child is a <dd> tag
136
- elsif ($xpath =~ m/\/$dd_tag$/)
137
- {
138
- my $dd = new Omni::Omnidd();
139
-
140
- # Set raw content
141
- $dd->set_raw($child->sprint());
142
-
143
- # Nested <dd> is not allowed so we copy the objects
144
- my $objects = $dd->get_objs_ref();
145
-
146
- # Update <dd> objects list
147
- push @tmp_objs, @{ $objects };
148
-
149
- # Update content
150
- $tmp_content = $tmp_content . $dd->get_content() . "\n";
151
- }
152
- # if this child is a <table> tag
153
- elsif ($xpath =~ m/\/$table_tag$/)
154
- {
155
- my $table = new Omni::Omnitable();
156
-
157
- # Set raw content
158
- $table->set_raw($child->sprint());
159
-
160
- # Update paragraph list
161
- push @tmp_objs, $table;
162
-
163
- # Update content
164
- $tmp_content = $tmp_content . $table->get_content() . "\n";
165
- }
166
- # if this child is a <picture> tag
167
- elsif ($xpath =~ m/\/$img_tag$/)
168
- {
169
- #my $img = new Omni::Omniimg();
170
-
171
- # Set raw content
172
- #$img->set_raw($child->sprint());
173
-
174
- # Update paragraph list
175
- #push @tmp_objs, $img;
176
-
177
- # Update content
178
- #$tmp_content = $tmp_content . $img->get_content() . "\n";
179
- }
180
- # if this child is a <column> tag
181
- elsif ($xpath =~ m/\/$column_tag$/)
182
- {
183
- #my $col = new Omni::Omnicol();
184
-
185
- # Set raw content
186
- #$col->set_raw($child->sprint());
187
-
188
- # Update paragraph list
189
- #push @tmp_objs, $col;
190
-
191
- # Update content
192
- #$tmp_content = $tmp_content . $col->get_content() . "\n";
193
- }
194
- # if this child is <frame>
195
- elsif ($xpath =~ m/\/$frame_tag$/)
196
- {
197
- my $frame = new Omni::Omniframe();
198
-
199
- # Set raw content
200
- $frame->set_raw($child->sprint());
201
-
202
- # Update column list
203
- push @tmp_objs, $frame;
204
-
205
- # Update content
206
- $tmp_content = $tmp_content . $frame->get_content() . "\n";
207
- }
208
-
209
- # Little brother
210
- if ($child->is_last_child)
211
- {
212
- last;
213
- }
214
- else
215
- {
216
- $child = $child->next_sibling();
217
- }
218
- }
219
-
220
- # Copy information from temporary variables to class members
221
- $$self->{ '_bottom' } = $tmp_bottom;
222
- $$self->{ '_top' } = $tmp_top;
223
- $$self->{ '_left' } = $tmp_left;
224
- $$self->{ '_right' } = $tmp_right;
225
- $$self->{ '_bottom_dist' } = $tmp_bottom_dist;
226
- $$self->{ '_top_dist' } = $tmp_top_dist;
227
- $$self->{ '_left_dist' } = $tmp_left_dist;
228
- $$self->{ '_right_dist' } = $tmp_right_dist;
229
-
230
- # Copy content
231
- $$self->{ '_content' } = $tmp_content;
232
-
233
- # Copy all objects
234
- @{$$self->{ '_objs' } } = @tmp_objs;
235
- }
236
-
237
- sub get_name
238
- {
239
- my ($self) = @_;
240
- return $self->{ '_self' };
241
- }
242
-
243
- sub get_objs_ref
244
- {
245
- my ($self) = @_;
246
- return $self->{ '_objs' };
247
- }
248
-
249
- sub get_content
250
- {
251
- my ($self) = @_;
252
- return $self->{ '_content' };
253
- }
254
-
255
- sub get_bottom_pos
256
- {
257
- my ($self) = @_;
258
- return $self->{ '_bottom' };
259
- }
260
-
261
- sub get_top_pos
262
- {
263
- my ($self) = @_;
264
- return $self->{ '_top' };
265
- }
266
-
267
- sub get_left_pos
268
- {
269
- my ($self) = @_;
270
- return $self->{ '_left' };
271
- }
272
-
273
- sub get_right_pos
274
- {
275
- my ($self) = @_;
276
- return $self->{ '_right' };
277
- }
278
-
279
- sub get_bottom_distance
280
- {
281
- my ($self) = @_;
282
- return $self->{ '_bottom_dist' };
283
- }
284
-
285
- sub get_top_distance
286
- {
287
- my ($self) = @_;
288
- return $self->{ '_top_dist' };
289
- }
290
-
291
- sub get_left_distance
292
- {
293
- my ($self) = @_;
294
- return $self->{ '_left_dist' };
295
- }
296
-
297
- sub get_right_distance
298
- {
299
- my ($self) = @_;
300
- return $self->{ '_right_dist' };
301
- }
302
-
303
- # Support functions
304
- sub GetNodeAttr
305
- {
306
- my ($node, $attr) = @_;
307
- return ($node->att($attr) ? $node->att($attr) : "");
308
- }
309
-
310
- sub SetNodeAttr
311
- {
312
- my ($node, $attr, $value) = @_;
313
- $node->set_att($attr, $value);
314
- }
315
-
316
- sub GetNodeText
317
- {
318
- my ($node) = @_;
319
- return $node->text;
320
- }
321
-
322
- sub SetNodeText
323
- {
324
- my ($node, $value) = @_;
325
- $node->set_text($value);
326
- }
327
-
328
- 1;
@@ -1,153 +0,0 @@
1
- package Omni::Omnidoc;
2
-
3
- # Configuration
4
- use strict;
5
-
6
- # Local libraries
7
- use Omni::Config;
8
- use Omni::Omnipage;
9
-
10
- # Extern libraries
11
- use XML::Twig;
12
- use XML::Parser;
13
-
14
- # Global variables
15
- my $tag_list = $Omni::Config::tag_list;
16
- my $att_list = $Omni::Config::att_list;
17
- my $obj_list = $Omni::Config::obj_list;
18
-
19
- # Temporary variables
20
- my $tmp_content = undef;
21
- my @tmp_pages = ();
22
-
23
- ###
24
- # A whole document object in Omnipage xml: a document contains many pages
25
- #
26
- # Do Hoang Nhat Huy, 09 Jan 2011
27
- ###
28
- # Initialization
29
- sub new
30
- {
31
- my ($class) = @_;
32
-
33
- # Pages: a document can have multiple pages
34
- my @pages = ();
35
-
36
- # Class members
37
- my $self = { '_self' => $obj_list->{ 'OMNIDOC' },
38
- '_raw' => undef,
39
- '_content' => undef,
40
- '_pages' => \@pages };
41
-
42
- bless $self, $class;
43
- return $self;
44
- }
45
-
46
- #
47
- sub set_raw
48
- {
49
- my ($self, $raw) = @_;
50
-
51
- # Save the raw xml <document> ... </document>
52
- $self->{ '_raw' } = $raw;
53
-
54
- # At first, content is blank
55
- $tmp_content = "";
56
- # because there are no pages yet
57
- @tmp_pages = ();
58
-
59
- # Parse the raw string
60
- my $twig_roots = { $tag_list->{ 'DOCUMENT' } => 1 };
61
- my $twig_handlers = { $tag_list->{ 'DOCUMENT' } => \&parse};
62
-
63
- # XML::Twig
64
- my $twig = new XML::Twig( twig_roots => $twig_roots,
65
- twig_handlers => $twig_handlers,
66
- pretty_print => 'indented' );
67
-
68
- # Start the XML parsing
69
- $twig->parse($raw);
70
- $twig->purge;
71
-
72
- # Copy information from temporary variables to class members
73
-
74
- # Copy all pages
75
- @{$self->{ '_pages' } } = @tmp_pages;
76
-
77
- # Copy content
78
- $self->{ '_content' } = $tmp_content;
79
- }
80
-
81
- sub get_raw
82
- {
83
- my ($self) = @_;
84
- return $self->{ '_raw' };
85
- }
86
-
87
- sub parse
88
- {
89
- my ($twig, $node) = @_;
90
-
91
- # Get <document> node attributes
92
-
93
- # Check if there's any page
94
- my @all_pages = $node->descendants( $tag_list->{ 'PAGE' } );
95
- foreach my $pg (@all_pages)
96
- {
97
- my $page = new Omni::Omnipage();
98
-
99
- # Set raw content
100
- $page->set_raw($pg->sprint());
101
-
102
- # Update page list
103
- push @tmp_pages, $page;
104
-
105
- # Update content
106
- $tmp_content = $tmp_content . $page->get_content() . "\n";
107
- }
108
- }
109
-
110
- sub get_name
111
- {
112
- my ($self) = @_;
113
- return $self->{ '_self' };
114
- }
115
-
116
- sub get_objs_ref
117
- {
118
- my ($self) = @_;
119
- return $self->{ '_pages' };
120
- }
121
-
122
- sub get_content
123
- {
124
- my ($self) = @_;
125
- return $self->{ '_content' };
126
- }
127
-
128
- # Support functions
129
- sub GetNodeAttr
130
- {
131
- my ($node, $attr) = @_;
132
- return ($node->att($attr) ? $node->att($attr) : "");
133
- }
134
-
135
- sub SetNodeAttr
136
- {
137
- my ($node, $attr, $value) = @_;
138
- $node->set_att($attr, $value);
139
- }
140
-
141
- sub GetNodeText
142
- {
143
- my ($node) = @_;
144
- return $node->text;
145
- }
146
-
147
- sub SetNodeText
148
- {
149
- my ($node, $value) = @_;
150
- $node->set_text($value);
151
- }
152
-
153
- 1;
@@ -1,223 +0,0 @@
1
- package Omni::Omniframe;
2
-
3
- # Configuration
4
- use strict;
5
-
6
- # Local libraries
7
- use Omni::Config;
8
- use Omni::Omnipara;
9
- use Omni::Omnitable;
10
-
11
- # Extern libraries
12
- use XML::Twig;
13
- use XML::Parser;
14
-
15
- # Global variables
16
- my $tag_list = $Omni::Config::tag_list;
17
- my $att_list = $Omni::Config::att_list;
18
- my $obj_list = $Omni::Config::obj_list;
19
-
20
- ###
21
- # A frame object in Omnipage xml: a frame contains paragraphs
22
- # (this is my observation and can be invalid under close scrutiny
23
- # of new "evidence")
24
- #
25
- # Do Hoang Nhat Huy, 23 Feb 2011
26
- ###
27
- # Initialization
28
- sub new
29
- {
30
- my ($class) = @_;
31
-
32
- # Objs: paragraphs
33
- my @objs = ();
34
-
35
- # Class members
36
- my $self = { '_self' => $obj_list->{ 'OMNIFRAME' },
37
- '_raw' => undef,
38
- '_content' => undef,
39
- '_bottom' => undef,
40
- '_top' => undef,
41
- '_left' => undef,
42
- '_right' => undef,
43
- '_objs' => \@objs };
44
-
45
- bless $self, $class;
46
- return $self;
47
- }
48
-
49
- sub set_raw
50
- {
51
- my ($self, $raw) = @_;
52
-
53
- # Save the raw xml <frame> ... </frame>
54
- $self->{ '_raw' } = $raw;
55
-
56
- # Parse the raw string
57
- my $twig_roots = { $tag_list->{ 'FRAME' } => 1 };
58
- my $twig_handlers = { $tag_list->{ 'FRAME' } => sub { parse(@_, \$self); } };
59
-
60
- # XML::Twig
61
- my $twig = new XML::Twig( twig_roots => $twig_roots,
62
- twig_handlers => $twig_handlers,
63
- pretty_print => 'indented' );
64
-
65
- # Start the XML parsing
66
- $twig->parse($raw, \$self);
67
- $twig->purge;
68
- }
69
-
70
- sub get_raw
71
- {
72
- my ($self) = @_;
73
- return $self->{ '_raw' };
74
- }
75
-
76
- sub parse
77
- {
78
- my ($twig, $node, $self) = @_;
79
-
80
- # At first, content is blank
81
- my $tmp_content = "";
82
- # because there's no object
83
- my @tmp_objs = ();
84
-
85
- # Get <frame> node attributes
86
- my $tmp_bottom = GetNodeAttr($node, $att_list->{ 'BOTTOM' });
87
- my $tmp_top = GetNodeAttr($node, $att_list->{ 'TOP' });
88
- my $tmp_left = GetNodeAttr($node, $att_list->{ 'LEFT' });
89
- my $tmp_right = GetNodeAttr($node, $att_list->{ 'RIGHT' });
90
-
91
- # Check if there's any paragraph, dd, table, or picture
92
- # The large number of possible children is due to the
93
- # ambiguous structure of the Omnipage XML
94
- my $para_tag = $tag_list->{ 'PARA' };
95
- my $table_tag = $tag_list->{ 'TABLE' };
96
-
97
- # Get the first child in the body text
98
- my $child = $node->first_child();
99
-
100
- while (defined $child)
101
- {
102
- my $xpath = $child->path();
103
-
104
- # if this child is <para>
105
- if ($xpath =~ m/\/$para_tag$/)
106
- {
107
- my $para = new Omni::Omnipara();
108
-
109
- # Set raw content
110
- $para->set_raw($child->sprint());
111
-
112
- # Update paragraph list
113
- push @tmp_objs, $para;
114
-
115
- # Update content
116
- $tmp_content = $tmp_content . $para->get_content() . "\n";
117
- }
118
- elsif ($xpath =~ m/\/$table_tag$/)
119
- {
120
- my $table = new Omni::Omnitable();
121
-
122
- # Set raw content
123
- $table->set_raw($child->sprint());
124
-
125
- # Update paragraph list
126
- push @tmp_objs, $table;
127
-
128
- # Update content
129
- $tmp_content = $tmp_content . $table->get_content() . "\n";
130
- }
131
-
132
- # Little brother
133
- if ($child->is_last_child)
134
- {
135
- last;
136
- }
137
- else
138
- {
139
- $child = $child->next_sibling();
140
- }
141
- }
142
-
143
- # Copy information from temporary variables to class members
144
- $$self->{ '_bottom' } = $tmp_bottom;
145
- $$self->{ '_top' } = $tmp_top;
146
- $$self->{ '_left' } = $tmp_left;
147
- $$self->{ '_right' } = $tmp_right;
148
-
149
- # Copy all objects
150
- @{$$self->{ '_objs' } } = @tmp_objs;
151
-
152
- # Copy content
153
- $$self->{ '_content' } = $tmp_content;
154
- }
155
-
156
- sub get_name
157
- {
158
- my ($self) = @_;
159
- return $self->{ '_self' };
160
- }
161
-
162
- sub get_objs_ref
163
- {
164
- my ($self) = @_;
165
- return $self->{ '_objs' };
166
- }
167
-
168
- sub get_content
169
- {
170
- my ($self) = @_;
171
- return $self->{ '_content' };
172
- }
173
-
174
- sub get_bottom_pos
175
- {
176
- my ($self) = @_;
177
- return $self->{ '_bottom' };
178
- }
179
-
180
- sub get_top_pos
181
- {
182
- my ($self) = @_;
183
- return $self->{ '_top' };
184
- }
185
-
186
- sub get_left_pos
187
- {
188
- my ($self) = @_;
189
- return $self->{ '_left' };
190
- }
191
-
192
- sub get_right_pos
193
- {
194
- my ($self) = @_;
195
- return $self->{ '_right' };
196
- }
197
-
198
- # Support functions
199
- sub GetNodeAttr
200
- {
201
- my ($node, $attr) = @_;
202
- return ($node->att($attr) ? $node->att($attr) : "");
203
- }
204
-
205
- sub SetNodeAttr
206
- {
207
- my ($node, $attr, $value) = @_;
208
- $node->set_att($attr, $value);
209
- }
210
-
211
- sub GetNodeText
212
- {
213
- my ($node) = @_;
214
- return $node->text;
215
- }
216
-
217
- sub SetNodeText
218
- {
219
- my ($node, $value) = @_;
220
- $node->set_text($value);
221
- }
222
-
223
- 1;