biblicit 2.0.5 → 2.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/lib/biblicit/version.rb +1 -1
- data/parscit/INSTALL +29 -7
- data/parscit/bin/parsHed/redo.parsHed.pl +3 -2
- data/parscit/bin/redo.parsCit.pl +3 -2
- data/parscit/bin/sectLabel/genericSect/crossValidation.rb +1 -1
- data/parscit/bin/sectLabel/genericSectExtract.rb +1 -1
- data/parscit/bin/sectLabel/redo.sectLabel.pl +3 -2
- data/parscit/doc/index.html +692 -0
- data/parscit/lib/ParsCit/Config.pm +1 -1
- data/parscit/lib/ParsCit/Tr2crfpp.pm +1 -1
- data/parscit/lib/ParsHed/Config.pm +1 -1
- data/parscit/lib/ParsHed/Tr2crfpp.pm +1 -1
- data/parscit/lib/ParsHed/Tr2crfpp_token.pm +1 -1
- data/parscit/lib/SectLabel/AAMatching.pm +1949 -0
- data/parscit/lib/SectLabel/Config.pm +1 -1
- data/parscit/lib/SectLabel/Tr2crfpp.pm +2 -2
- metadata +5 -8
@@ -0,0 +1,1949 @@
|
|
1
|
+
package SectLabel::AAMatching;
|
2
|
+
|
3
|
+
###
|
4
|
+
# This package provides methods to solve the matching problem
|
5
|
+
# between author and affiliation in a pdf
|
6
|
+
#
|
7
|
+
# Do Hoang Nhat Huy 21 Apr, 11
|
8
|
+
###
|
9
|
+
|
10
|
+
use strict;
|
11
|
+
|
12
|
+
# Dependencies
|
13
|
+
use POSIX;
|
14
|
+
use IO::File;
|
15
|
+
use XML::Writer;
|
16
|
+
use XML::Writer::String;
|
17
|
+
|
18
|
+
use Class::Struct;
|
19
|
+
|
20
|
+
# Local libraries
|
21
|
+
use SectLabel::Config;
|
22
|
+
use ParsCit::PostProcess;
|
23
|
+
|
24
|
+
# Dictionary
|
25
|
+
my %dict = ();
|
26
|
+
# CRF++
|
27
|
+
my $crft = $ENV{'CRFPP_HOME'} ? "$ENV{'CRFPP_HOME'}/bin/crf_test" : "$FindBin::Bin/../$SectLabel::Config::crf_test";
|
28
|
+
|
29
|
+
# Matching features of each author, including
|
30
|
+
# Signals
|
31
|
+
# Coordinations: top, bottom, left, right
|
32
|
+
# Position: page, sections, paragraph, line
|
33
|
+
struct aut_rcfeatures =>
|
34
|
+
{
|
35
|
+
signals => '@',
|
36
|
+
|
37
|
+
top => '$',
|
38
|
+
bottom => '$',
|
39
|
+
left => '$',
|
40
|
+
right => '$',
|
41
|
+
|
42
|
+
page => '$',
|
43
|
+
section => '$',
|
44
|
+
para => '$',
|
45
|
+
line => '$'
|
46
|
+
};
|
47
|
+
|
48
|
+
# Matching features of each affiliation, including
|
49
|
+
# Signals
|
50
|
+
# Coordinations: top, bottom, left, right
|
51
|
+
# Position: page, sections, paragraph, line
|
52
|
+
struct aff_rcfeatures =>
|
53
|
+
{
|
54
|
+
signals => '@',
|
55
|
+
|
56
|
+
top => '$',
|
57
|
+
bottom => '$',
|
58
|
+
left => '$',
|
59
|
+
right => '$',
|
60
|
+
|
61
|
+
page => '$',
|
62
|
+
section => '$',
|
63
|
+
para => '$',
|
64
|
+
line => '$'
|
65
|
+
};
|
66
|
+
|
67
|
+
# Author
|
68
|
+
# Affiliation
|
69
|
+
sub AAMatching
|
70
|
+
{
|
71
|
+
my ($doc, $aut_addrs, $aff_addrs) = @_;
|
72
|
+
|
73
|
+
my $need_object = 1;
|
74
|
+
# Get the author objects
|
75
|
+
my $aut_lines = Omni::Traversal::OmniCollector($doc, $aut_addrs, $need_object);
|
76
|
+
# Get the affiliation objects
|
77
|
+
my $aff_lines = Omni::Traversal::OmniCollector($doc, $aff_addrs, $need_object);
|
78
|
+
|
79
|
+
# Dictionary
|
80
|
+
ReadDict($FindBin::Bin . "/../" . $SectLabel::Config::dictFile);
|
81
|
+
|
82
|
+
# Authors
|
83
|
+
my ($aut_features, $aut_rc_features) = AuthorFeatureExtraction($aut_lines, $aut_addrs);
|
84
|
+
# Call CRF
|
85
|
+
my ($aut_signal, $aut_rc) = AuthorExtraction($aut_features, $aut_rc_features);
|
86
|
+
|
87
|
+
# Affiliations
|
88
|
+
my ($aff_features, $aff_rc_features) = AffiliationFeatureExtraction($aff_lines, $aff_addrs);
|
89
|
+
# Call CRF
|
90
|
+
my ($aff_signal, $aff_rc, $affs) = AffiliationExtraction($aff_features, $aff_rc_features);
|
91
|
+
|
92
|
+
# Matching features
|
93
|
+
my $aa_features = AAFeatureExtraction($aut_rc, $aff_rc);
|
94
|
+
# Matching
|
95
|
+
my $aa = AAMatchingImp($aa_features);
|
96
|
+
|
97
|
+
=pod
|
98
|
+
# DEBUG
|
99
|
+
my $aut_handle = undef;
|
100
|
+
my $aff_handle = undef;
|
101
|
+
my $aau_handle = undef;
|
102
|
+
my $aaf_handle = undef;
|
103
|
+
my $aut_debug = undef;
|
104
|
+
my $aff_debug = undef;
|
105
|
+
my $aa_handle = undef;
|
106
|
+
|
107
|
+
open $aut_handle, ">:utf8", "aut.features";
|
108
|
+
open $aff_handle, ">:utf8", "aff.features";
|
109
|
+
open $aau_handle, ">:utf8", "aau.features";
|
110
|
+
open $aaf_handle, ">:utf8", "aaf.features";
|
111
|
+
open $aut_debug, ">:utf8", "aut.debug.features";
|
112
|
+
open $aff_debug, ">:utf8", "aff.debug.features";
|
113
|
+
open $aa_handle, ">:utf8", "aa.features";
|
114
|
+
|
115
|
+
print $aut_handle $aut_features;
|
116
|
+
print $aff_handle $aff_features;
|
117
|
+
print $aau_handle $aut_rc_features;
|
118
|
+
print $aaf_handle $aff_rc_features;
|
119
|
+
print $aa_handle $aa_features, "\n";
|
120
|
+
|
121
|
+
foreach my $author (keys %{ $aut_rc } )
|
122
|
+
{
|
123
|
+
print $aut_debug $author, ": ", "\n";
|
124
|
+
|
125
|
+
foreach my $signal (@{ $aut_rc->{ $author }->signals })
|
126
|
+
{
|
127
|
+
print $aut_debug "\t", $signal, "\n";
|
128
|
+
}
|
129
|
+
|
130
|
+
print $aut_debug "\t", $aut_rc->{ $author }->top, "\n";
|
131
|
+
print $aut_debug "\t", $aut_rc->{ $author }->bottom, "\n";
|
132
|
+
print $aut_debug "\t", $aut_rc->{ $author }->left, "\n";
|
133
|
+
print $aut_debug "\t", $aut_rc->{ $author }->right, "\n";
|
134
|
+
|
135
|
+
print $aut_debug "\t", $aut_rc->{ $author }->page, "\n";
|
136
|
+
print $aut_debug "\t", $aut_rc->{ $author }->section, "\n";
|
137
|
+
print $aut_debug "\t", $aut_rc->{ $author }->para, "\n";
|
138
|
+
print $aut_debug "\t", $aut_rc->{ $author }->line, "\n";
|
139
|
+
}
|
140
|
+
|
141
|
+
foreach my $affiliation (keys %{ $aff_rc } )
|
142
|
+
{
|
143
|
+
print $aff_debug $affiliation, ": ", "\n";
|
144
|
+
|
145
|
+
foreach my $signal (@{ $aff_rc->{ $affiliation }->signals })
|
146
|
+
{
|
147
|
+
print $aff_debug "\t", $signal, "\n";
|
148
|
+
}
|
149
|
+
|
150
|
+
print $aff_debug "\t", $aff_rc->{ $affiliation }->top, "\n";
|
151
|
+
print $aff_debug "\t", $aff_rc->{ $affiliation }->bottom, "\n";
|
152
|
+
print $aff_debug "\t", $aff_rc->{ $affiliation }->left, "\n";
|
153
|
+
print $aff_debug "\t", $aff_rc->{ $affiliation }->right, "\n";
|
154
|
+
|
155
|
+
print $aff_debug "\t", $aff_rc->{ $affiliation }->page, "\n";
|
156
|
+
print $aff_debug "\t", $aff_rc->{ $affiliation }->section, "\n";
|
157
|
+
print $aff_debug "\t", $aff_rc->{ $affiliation }->para, "\n";
|
158
|
+
print $aff_debug "\t", $aff_rc->{ $affiliation }->line, "\n";
|
159
|
+
}
|
160
|
+
|
161
|
+
close $aut_handle;
|
162
|
+
close $aff_handle;
|
163
|
+
close $aau_handle;
|
164
|
+
close $aaf_handle;
|
165
|
+
close $aut_debug;
|
166
|
+
close $aff_debug;
|
167
|
+
close $aa_handle;
|
168
|
+
# END
|
169
|
+
=cut
|
170
|
+
|
171
|
+
# Do the matching
|
172
|
+
# XML string
|
173
|
+
my $sxml = "";
|
174
|
+
# and XML writer
|
175
|
+
my $writer = new XML::Writer(OUTPUT => \$sxml, ENCODING => 'utf-8', DATA_MODE => 'true', DATA_INDENT => 2);
|
176
|
+
|
177
|
+
# Algorithm
|
178
|
+
$writer->startTag("algorithm", "name" => "AAMatching", "version" => $SectLabel::Config::algorithmVersion);
|
179
|
+
|
180
|
+
# XML header
|
181
|
+
my $date = `date`; chomp($date);
|
182
|
+
my $time = `date +%s`; chomp($time);
|
183
|
+
# Write XML header
|
184
|
+
$writer->startTag("results", "time" => $time, "date" => $date);
|
185
|
+
|
186
|
+
# Write authors
|
187
|
+
$writer->startTag("authors");
|
188
|
+
|
189
|
+
# Write the author name and his corresponding institution
|
190
|
+
foreach my $author (keys %{ $aut_signal })
|
191
|
+
{
|
192
|
+
$writer->startTag("author");
|
193
|
+
|
194
|
+
$writer->startTag("fullname", "source" => "parscit");
|
195
|
+
$writer->characters($author);
|
196
|
+
$writer->endTag("fullname");
|
197
|
+
|
198
|
+
$writer->startTag("institutions");
|
199
|
+
=pod
|
200
|
+
foreach my $signal (@{ $aut_signal->{ $author } })
|
201
|
+
{
|
202
|
+
$signal =~ s/^\s+|\s+$//g;
|
203
|
+
# Skip blank
|
204
|
+
if ($signal eq "") { next; }
|
205
|
+
|
206
|
+
$writer->startTag("institution", "symbol" => $signal);
|
207
|
+
$writer->characters($aff_signal->{ $signal });
|
208
|
+
$writer->endTag("institution");
|
209
|
+
}
|
210
|
+
=cut
|
211
|
+
|
212
|
+
foreach my $affiliation (@{ $aa->{ $author } })
|
213
|
+
{
|
214
|
+
$writer->startTag("institution");
|
215
|
+
$writer->characters($affiliation);
|
216
|
+
$writer->endTag("institution");
|
217
|
+
}
|
218
|
+
|
219
|
+
$writer->endTag("institutions");
|
220
|
+
|
221
|
+
$writer->endTag("author");
|
222
|
+
}
|
223
|
+
|
224
|
+
# Finish authors
|
225
|
+
$writer->endTag("authors");
|
226
|
+
|
227
|
+
# Write institutions
|
228
|
+
$writer->startTag("institutions");
|
229
|
+
|
230
|
+
# Write the instituion name
|
231
|
+
foreach my $institute (@{ $affs })
|
232
|
+
{
|
233
|
+
$writer->startTag("institution");
|
234
|
+
$writer->characters($institute);
|
235
|
+
$writer->endTag("institution");
|
236
|
+
}
|
237
|
+
|
238
|
+
$writer->endTag("institutions");
|
239
|
+
|
240
|
+
# Done
|
241
|
+
$writer->endTag("results");
|
242
|
+
# Done
|
243
|
+
$writer->endTag("algorithm");
|
244
|
+
# Done
|
245
|
+
$writer->end();
|
246
|
+
|
247
|
+
# Return the xml content back to the caller
|
248
|
+
return $sxml;
|
249
|
+
}
|
250
|
+
|
251
|
+
# Features of the relational classifier between author and affiliation
|
252
|
+
sub AAFeatureExtraction
|
253
|
+
{
|
254
|
+
my ($aut_rc, $aff_rc) = @_;
|
255
|
+
|
256
|
+
# Relational features
|
257
|
+
my $features = "";
|
258
|
+
|
259
|
+
# Features between x authors
|
260
|
+
foreach my $author (keys %{ $aut_rc })
|
261
|
+
{
|
262
|
+
my @aut_tokens = split /\s/, $author;
|
263
|
+
my $author_nb = join '|||', @aut_tokens;
|
264
|
+
|
265
|
+
my $min_aff_x = undef;
|
266
|
+
my $min_dist_x = LONG_MAX;
|
267
|
+
my $min_aff_y = undef;
|
268
|
+
my $min_dist_y = LONG_MAX;
|
269
|
+
# Find the nearest affiliation
|
270
|
+
foreach my $aff (keys %{ $aff_rc })
|
271
|
+
{
|
272
|
+
my $aut_x = ($aut_rc->{ $author }->left + $aut_rc->{ $author }->right) / 2;
|
273
|
+
my $aut_y = ($aut_rc->{ $author }->top + $aut_rc->{ $author }->bottom) / 2;
|
274
|
+
|
275
|
+
my $aff_x = ($aff_rc->{ $aff }->left + $aff_rc->{ $aff }->right) / 2;
|
276
|
+
my $aff_y = ($aff_rc->{ $aff }->top + $aff_rc->{ $aff }->bottom) / 2;
|
277
|
+
|
278
|
+
my $dis_x = abs( $aut_x - $aff_x );
|
279
|
+
my $dis_y = abs( $aut_y - $aff_y );
|
280
|
+
# Distance between an author and an affiliation
|
281
|
+
# my $distance = sqrt( $dis_x * $dis_x + $dis_y * $dis_y );
|
282
|
+
|
283
|
+
# Check if it it the minimum distance in x axis
|
284
|
+
if ($dis_x < $min_dist_x)
|
285
|
+
{
|
286
|
+
$min_dist_x = $dis_x;
|
287
|
+
$min_aff_x = $aff;
|
288
|
+
}
|
289
|
+
|
290
|
+
# Check if it it the minimum distance in y axis
|
291
|
+
if ($dis_y < $min_dist_y)
|
292
|
+
{
|
293
|
+
$min_dist_y = $dis_y;
|
294
|
+
$min_aff_y = $aff;
|
295
|
+
}
|
296
|
+
}
|
297
|
+
|
298
|
+
# and y affiliation
|
299
|
+
foreach my $aff (keys %{ $aff_rc })
|
300
|
+
{
|
301
|
+
my @aff_tokens = split /\s/, $aff;
|
302
|
+
my $aff_nb = join '|||', @aff_tokens;
|
303
|
+
|
304
|
+
# Content
|
305
|
+
$features .= $author_nb . "#" . $aff_nb . "\t";
|
306
|
+
|
307
|
+
my $signal = undef;
|
308
|
+
# Signal
|
309
|
+
if ((scalar(@{ $aut_rc->{ $author }->signals }) == 0) || (scalar(@{ $aff_rc->{ $aff }->signals }) == 0))
|
310
|
+
{
|
311
|
+
$signal = "diff";
|
312
|
+
}
|
313
|
+
else
|
314
|
+
{
|
315
|
+
my $matched = undef;
|
316
|
+
# Check each author signal
|
317
|
+
foreach my $aut_sig (@{ $aut_rc->{ $author }->signals })
|
318
|
+
{
|
319
|
+
# if it match with affiliation signal
|
320
|
+
if ($aut_sig eq ${ $aff_rc->{ $aff }->signals }[ 0 ]) { $matched = 1; last; }
|
321
|
+
}
|
322
|
+
|
323
|
+
$signal = (! defined $matched) ? "diff" : "same";
|
324
|
+
}
|
325
|
+
|
326
|
+
# Signal
|
327
|
+
$features .= $signal . "\t";
|
328
|
+
|
329
|
+
# Same page
|
330
|
+
my $page = ($aut_rc->{ $author }->page == $aff_rc->{ $aff }->page) ? "yes" : "no";
|
331
|
+
$features .= $page . "\t";
|
332
|
+
|
333
|
+
my $section = undef;
|
334
|
+
# Same section
|
335
|
+
if ($page eq "yes")
|
336
|
+
{
|
337
|
+
$section = ($aut_rc->{ $author }->section == $aff_rc->{ $aff }->section) ? "yes" : "no";
|
338
|
+
$features .= $section . "\t";
|
339
|
+
}
|
340
|
+
else
|
341
|
+
{
|
342
|
+
$section = "no";
|
343
|
+
$features .= $section . "\t";
|
344
|
+
}
|
345
|
+
|
346
|
+
my $para = undef;
|
347
|
+
# Same paragraph
|
348
|
+
if (($page eq "yes") && ($section eq "yes"))
|
349
|
+
{
|
350
|
+
$para = ($aut_rc->{ $author }->para == $aff_rc->{ $aff }->para) ? "yes" : "no";
|
351
|
+
$features .= $para . "\t";
|
352
|
+
}
|
353
|
+
else
|
354
|
+
{
|
355
|
+
$para = "no";
|
356
|
+
$features .= $para . "\t";
|
357
|
+
}
|
358
|
+
|
359
|
+
my $line = undef;
|
360
|
+
# Same line
|
361
|
+
if (($page eq "yes") && ($section eq "yes") && ($para eq "yes"))
|
362
|
+
{
|
363
|
+
$line = ($aut_rc->{ $author }->line == $aff_rc->{ $aff }->line) ? "yes" : "no";
|
364
|
+
$features .= $line . "\t";
|
365
|
+
}
|
366
|
+
else
|
367
|
+
{
|
368
|
+
$line = "no";
|
369
|
+
$features .= $line . "\t";
|
370
|
+
}
|
371
|
+
|
372
|
+
# Is neartest affiliation in x axis ?
|
373
|
+
my $nearest_x = ($aff eq $min_aff_x) ? "yes" : "no";
|
374
|
+
$features .= $nearest_x . "\t";
|
375
|
+
|
376
|
+
# Is neartest affiliation in y axis ?
|
377
|
+
my $nearest_y = ($aff eq $min_aff_y) ? "yes" : "no";
|
378
|
+
$features .= $nearest_y . "\n";
|
379
|
+
}
|
380
|
+
}
|
381
|
+
|
382
|
+
return $features;
|
383
|
+
}
|
384
|
+
|
385
|
+
# Actually do the matching between author and affiliation
|
386
|
+
sub AAMatchingImp
|
387
|
+
{
|
388
|
+
my ($features) = @_;
|
389
|
+
|
390
|
+
# Temporary input file for CRF
|
391
|
+
my $infile = BuildTmpFile("aa-input");
|
392
|
+
# Temporary output file for CRF
|
393
|
+
my $outfile = BuildTmpFile("aa-output");
|
394
|
+
|
395
|
+
my $output_handle = undef;
|
396
|
+
# Split and write to temporary input
|
397
|
+
open $output_handle, ">:utf8", $infile;
|
398
|
+
# Split
|
399
|
+
my @lines = split /\n/, $features;
|
400
|
+
# and write
|
401
|
+
foreach my $line (@lines)
|
402
|
+
{
|
403
|
+
if ($line eq "")
|
404
|
+
{
|
405
|
+
print $output_handle "\n";
|
406
|
+
}
|
407
|
+
else
|
408
|
+
{
|
409
|
+
print $output_handle $line, "\t", "no", "\n";
|
410
|
+
}
|
411
|
+
}
|
412
|
+
# Done
|
413
|
+
close $output_handle;
|
414
|
+
|
415
|
+
# AA matching model
|
416
|
+
my $match_model = $SectLabel::Config::matFile;
|
417
|
+
|
418
|
+
# Matching
|
419
|
+
system("$crft -m $match_model $infile > $outfile");
|
420
|
+
|
421
|
+
# List of authors and their affiliation (if exists)
|
422
|
+
my %aa = ();
|
423
|
+
|
424
|
+
my $input_handle = undef;
|
425
|
+
# Read the CRF output
|
426
|
+
open $input_handle, "<:utf8", $outfile;
|
427
|
+
# Read each line and get its label
|
428
|
+
while (<$input_handle>)
|
429
|
+
{
|
430
|
+
my $line = $_;
|
431
|
+
# Trim
|
432
|
+
$line =~ s/^\s+|\s+$//g;
|
433
|
+
# Blank linem, what the heck ?
|
434
|
+
if ($line eq "") { next; }
|
435
|
+
|
436
|
+
# Split the line
|
437
|
+
my @fields = split /\t/, $line;
|
438
|
+
# and extract the class and the content
|
439
|
+
my $class = $fields[ -1 ];
|
440
|
+
my $content = $fields[ 0 ];
|
441
|
+
|
442
|
+
# You miss
|
443
|
+
if ($class ne "yes") { next; }
|
444
|
+
|
445
|
+
# Split the content into author name and affiliation name
|
446
|
+
my @tmp = split /#/, $content;
|
447
|
+
# Author name
|
448
|
+
my $author = $tmp[ 0 ];
|
449
|
+
$author =~ s/\|\|\|/ /g;
|
450
|
+
# Affiliation name
|
451
|
+
my $aff = $tmp[ 1 ];
|
452
|
+
$aff =~ s/\|\|\|/ /g;
|
453
|
+
|
454
|
+
# Save
|
455
|
+
if (! exists $aa{ $author }) { $aa{ $author } = (); }
|
456
|
+
# Save
|
457
|
+
push @{ $aa{ $author } }, $aff;
|
458
|
+
}
|
459
|
+
|
460
|
+
# Done
|
461
|
+
close $input_handle;
|
462
|
+
|
463
|
+
# Clean up
|
464
|
+
unlink $infile;
|
465
|
+
unlink $outfile;
|
466
|
+
# Done
|
467
|
+
return (\%aa);
|
468
|
+
}
|
469
|
+
|
470
|
+
# Extract affiliation and their signal using crf
|
471
|
+
sub AffiliationExtraction
|
472
|
+
{
|
473
|
+
my ($features, $rc_features) = @_;
|
474
|
+
|
475
|
+
# Temporary input file for CRF
|
476
|
+
my $infile = BuildTmpFile("aff-input");
|
477
|
+
# Temporary output file for CRF
|
478
|
+
my $outfile = BuildTmpFile("aff-output");
|
479
|
+
|
480
|
+
my $output_handle = undef;
|
481
|
+
# Split and write to temporary input
|
482
|
+
open $output_handle, ">:utf8", $infile;
|
483
|
+
# Split
|
484
|
+
my @lines = split /\n/, $features;
|
485
|
+
# and write
|
486
|
+
foreach my $line (@lines)
|
487
|
+
{
|
488
|
+
if ($line eq "")
|
489
|
+
{
|
490
|
+
print $output_handle "\n";
|
491
|
+
}
|
492
|
+
else
|
493
|
+
{
|
494
|
+
print $output_handle $line, "\t", "affiliation", "\n";
|
495
|
+
}
|
496
|
+
}
|
497
|
+
# Done
|
498
|
+
close $output_handle;
|
499
|
+
|
500
|
+
# Author model
|
501
|
+
my $aff_model = $SectLabel::Config::affFile;
|
502
|
+
|
503
|
+
# Split the authors
|
504
|
+
system("$crft -m $aff_model $infile > $outfile");
|
505
|
+
|
506
|
+
# Each affiliation can have only one signal
|
507
|
+
my %asg = ();
|
508
|
+
# Each affilitiaon can have only one struct
|
509
|
+
my %aaf = ();
|
510
|
+
# List of all affiliations
|
511
|
+
my @aff = ();
|
512
|
+
|
513
|
+
# Each line in the relational features string
|
514
|
+
my @rc_lines = split /\n/, $rc_features;
|
515
|
+
|
516
|
+
my $input_handle = undef;
|
517
|
+
# Read the CRF output
|
518
|
+
open $input_handle, "<:utf8", $outfile;
|
519
|
+
# Author and signal string
|
520
|
+
my $prev_class = "";
|
521
|
+
my @aff_str = ();
|
522
|
+
my $signal_str = "";
|
523
|
+
# Relational classifier
|
524
|
+
my @aaf_rc = ();
|
525
|
+
# Line counter
|
526
|
+
my $counter = 0;
|
527
|
+
# Next to last signal
|
528
|
+
my $ntl_signal = "";
|
529
|
+
# Read each line and get its label
|
530
|
+
# TODO: The code assumes that an affiliation will have the following format: 1 foobar institute
|
531
|
+
while (<$input_handle>)
|
532
|
+
{
|
533
|
+
my $line = $_;
|
534
|
+
# Trim
|
535
|
+
$line =~ s/^\s+|\s+$//g;
|
536
|
+
# Blank line mark the end of an affiliation section
|
537
|
+
if ($line eq "")
|
538
|
+
{
|
539
|
+
if ($prev_class eq "affiliation")
|
540
|
+
{
|
541
|
+
my ($affiliation, $rcs) = NormalizeAffiliationName(\@aff_str, \@aaf_rc);
|
542
|
+
# Save the affiliation
|
543
|
+
push @aff, $affiliation;
|
544
|
+
# and its signal
|
545
|
+
if ($ntl_signal ne "") { $asg{ $ntl_signal } = $affiliation; }
|
546
|
+
|
547
|
+
# Save the signal
|
548
|
+
push @{ $rcs->signals }, $ntl_signal;
|
549
|
+
# Save the record
|
550
|
+
$aaf{ $affiliation } = $rcs;
|
551
|
+
}
|
552
|
+
elsif ($prev_class eq "signal")
|
553
|
+
{
|
554
|
+
# Save the next to last signal
|
555
|
+
$ntl_signal = NormalizeAffiliationSignal($signal_str);
|
556
|
+
}
|
557
|
+
|
558
|
+
# Cleanup
|
559
|
+
$ntl_signal = "";
|
560
|
+
# Cleanup
|
561
|
+
@aff_str = ();
|
562
|
+
$signal_str = "";
|
563
|
+
$prev_class = "";
|
564
|
+
# Cleanup
|
565
|
+
@aaf_rc = ();
|
566
|
+
|
567
|
+
# Update the counter
|
568
|
+
$counter++;
|
569
|
+
|
570
|
+
next ;
|
571
|
+
}
|
572
|
+
|
573
|
+
# Split the line
|
574
|
+
my @fields = split /\t/, $line;
|
575
|
+
# and extract the class and the content
|
576
|
+
my $class = $fields[ -1 ];
|
577
|
+
my $content = $fields[ 0 ];
|
578
|
+
|
579
|
+
if ($class eq $prev_class)
|
580
|
+
{
|
581
|
+
# An affiliation
|
582
|
+
if ($class eq "affiliation")
|
583
|
+
{
|
584
|
+
push @aff_str, $content;
|
585
|
+
push @aaf_rc, $rc_lines[ $counter ];
|
586
|
+
}
|
587
|
+
# A signal
|
588
|
+
elsif ($class eq "signal")
|
589
|
+
{
|
590
|
+
$signal_str .= $content . " ";
|
591
|
+
}
|
592
|
+
}
|
593
|
+
else
|
594
|
+
{
|
595
|
+
if ($prev_class eq "affiliation")
|
596
|
+
{
|
597
|
+
my ($affiliation, $rcs) = NormalizeAffiliationName(\@aff_str, \@aaf_rc);
|
598
|
+
# Save the affiliation
|
599
|
+
push @aff, $affiliation;
|
600
|
+
# and its signal
|
601
|
+
if ($ntl_signal ne "") { $asg{ $ntl_signal } = $affiliation; }
|
602
|
+
|
603
|
+
# Save the signal
|
604
|
+
push @{ $rcs->signals }, $ntl_signal;
|
605
|
+
# Save the record
|
606
|
+
$aaf{ $affiliation } = $rcs;
|
607
|
+
|
608
|
+
}
|
609
|
+
elsif ($prev_class eq "signal")
|
610
|
+
{
|
611
|
+
# Save the next to last signal
|
612
|
+
$ntl_signal = NormalizeAffiliationSignal($signal_str);
|
613
|
+
}
|
614
|
+
|
615
|
+
# Cleanup
|
616
|
+
@aff_str = ();
|
617
|
+
$signal_str = "";
|
618
|
+
@aaf_rc = ();
|
619
|
+
# Switch to the current class
|
620
|
+
$prev_class = $class;
|
621
|
+
|
622
|
+
if ($class eq "affiliation")
|
623
|
+
{
|
624
|
+
push @aff_str, $content;
|
625
|
+
push @aaf_rc, $rc_lines[ $counter ];
|
626
|
+
}
|
627
|
+
elsif ($class eq "signal")
|
628
|
+
{
|
629
|
+
$signal_str .= $content . " ";
|
630
|
+
}
|
631
|
+
}
|
632
|
+
|
633
|
+
# Update the counter
|
634
|
+
$counter++;
|
635
|
+
}
|
636
|
+
|
637
|
+
# Final class
|
638
|
+
if ($prev_class eq "affiliation")
|
639
|
+
{
|
640
|
+
my ($affiliation, $rcs) = NormalizeAffiliationName(\@aff_str, \@aaf_rc);
|
641
|
+
# Save the affiliation
|
642
|
+
push @aff, $affiliation;
|
643
|
+
# and its signal
|
644
|
+
if ($ntl_signal ne "") { $asg{ $ntl_signal } = $affiliation; }
|
645
|
+
|
646
|
+
# Save the signal
|
647
|
+
push @{ $rcs->signals }, $ntl_signal;
|
648
|
+
# Save the record
|
649
|
+
$aaf{ $affiliation } = $rcs;
|
650
|
+
}
|
651
|
+
elsif ($prev_class eq "signal")
|
652
|
+
{
|
653
|
+
# Save the next to last signal
|
654
|
+
$ntl_signal = NormalizeAffiliationSignal($signal_str);
|
655
|
+
}
|
656
|
+
|
657
|
+
# Done
|
658
|
+
close $input_handle;
|
659
|
+
|
660
|
+
# Clean up
|
661
|
+
unlink $infile;
|
662
|
+
unlink $outfile;
|
663
|
+
# Done
|
664
|
+
return (\%asg, \%aaf, \@aff);
|
665
|
+
}
|
666
|
+
|
667
|
+
sub NormalizeAffiliationSignal
|
668
|
+
{
|
669
|
+
my ($signal_str) = @_;
|
670
|
+
|
671
|
+
# Trim
|
672
|
+
$signal_str =~ s/^\s+|\s+$//g;
|
673
|
+
# Remove all space inside the signature
|
674
|
+
$signal_str =~ s/\s+//g;
|
675
|
+
|
676
|
+
# Done
|
677
|
+
return $signal_str;
|
678
|
+
}
|
679
|
+
|
680
|
+
sub NormalizeAffiliationName
|
681
|
+
{
|
682
|
+
my ($aff_str, $aaf_rc) = @_;
|
683
|
+
|
684
|
+
# Constraint
|
685
|
+
if (scalar(@{ $aff_str }) != scalar(@{ $aaf_rc })) { print STDERR "# It cannot happen, if you encounter it, please consider report it as a bug", "\n"; die; }
|
686
|
+
|
687
|
+
# Affiliation string
|
688
|
+
my $affiliation = join ' ', @{ $aff_str };
|
689
|
+
|
690
|
+
# First word
|
691
|
+
my @fields = split /\s/, $aaf_rc->[ 0 ];
|
692
|
+
# Save the relational features of an affiliation (its first word)
|
693
|
+
my $rcs = aff_rcfeatures->new( signals => [],
|
694
|
+
top => $fields[ 1 ], bottom => $fields[ 2 ], left => $fields[ 3 ], right => $fields[ 4 ],
|
695
|
+
page => $fields[ 5 ], section => $fields[ 6 ], para => $fields[ 7 ], line => $fields[ 8 ] );
|
696
|
+
# Done
|
697
|
+
return ($affiliation, $rcs);
|
698
|
+
}
|
699
|
+
|
700
|
+
# Extract author name and their signal using crf
|
701
|
+
sub AuthorExtraction
|
702
|
+
{
|
703
|
+
my ($features, $rc_features) = @_;
|
704
|
+
|
705
|
+
# Temporary input file for CRF
|
706
|
+
my $infile = BuildTmpFile("aut-input");
|
707
|
+
# Temporary output file for CRF
|
708
|
+
my $outfile = BuildTmpFile("aut-output");
|
709
|
+
|
710
|
+
my $output_handle = undef;
|
711
|
+
# Split and write to temporary input
|
712
|
+
open $output_handle, ">:utf8", $infile;
|
713
|
+
# Split
|
714
|
+
my @lines = split /\n/, $features;
|
715
|
+
# and write
|
716
|
+
foreach my $line (@lines)
|
717
|
+
{
|
718
|
+
if ($line eq "")
|
719
|
+
{
|
720
|
+
print $output_handle "\n";
|
721
|
+
}
|
722
|
+
else
|
723
|
+
{
|
724
|
+
print $output_handle $line, "\t", "ns", "\n";
|
725
|
+
}
|
726
|
+
}
|
727
|
+
# Done
|
728
|
+
close $output_handle;
|
729
|
+
|
730
|
+
# Author model
|
731
|
+
my $author_model = $SectLabel::Config::autFile;
|
732
|
+
|
733
|
+
# Split the authors
|
734
|
+
system("$crft -m $author_model $infile > $outfile");
|
735
|
+
|
736
|
+
# Each author can have one or more signals
|
737
|
+
my %asg = ();
|
738
|
+
# Each author can have only one struct
|
739
|
+
my %aas = ();
|
740
|
+
|
741
|
+
# Each line in the relational features string
|
742
|
+
my @rc_lines = split /\n/, $rc_features;
|
743
|
+
|
744
|
+
my $input_handle = undef;
|
745
|
+
# Read the CRF output
|
746
|
+
open $input_handle, "<:utf8", $outfile;
|
747
|
+
# Author and signal string
|
748
|
+
my $prev_class = "";
|
749
|
+
my @author_str = ();
|
750
|
+
my $signal_str = "";
|
751
|
+
# Relational classifier
|
752
|
+
my @author_rc = ();
|
753
|
+
# Line counter
|
754
|
+
my $counter = 0;
|
755
|
+
# Next to last authors
|
756
|
+
my %ntl_asg = ();
|
757
|
+
#
|
758
|
+
my $is_authors = 0;
|
759
|
+
# Read each line and get its label
|
760
|
+
while (<$input_handle>)
|
761
|
+
{
|
762
|
+
my $line = $_;
|
763
|
+
# Trim
|
764
|
+
$line =~ s/^\s+|\s+$//g;
|
765
|
+
# Blank line mark the end of an author section
|
766
|
+
if ($line eq "")
|
767
|
+
{
|
768
|
+
if ($prev_class eq "author")
|
769
|
+
{
|
770
|
+
my ($authors, $rcs) = NormalizeAuthorNames(\@author_str, \@author_rc);
|
771
|
+
# Save each author
|
772
|
+
for (my $i = 0; $i < scalar(@{ $authors }); $i++)
|
773
|
+
{
|
774
|
+
$asg{ $authors->[ $i ] } = ();
|
775
|
+
$aas{ $authors->[ $i ] } = $rcs->[ $i ];
|
776
|
+
$ntl_asg{ $authors->[ $i ] } = 0;
|
777
|
+
}
|
778
|
+
}
|
779
|
+
elsif ($prev_class eq "signal")
|
780
|
+
{
|
781
|
+
my $signals = NormalizeAuthorSignal($signal_str);
|
782
|
+
# Save each signal to its corresponding author
|
783
|
+
foreach my $author (keys %ntl_asg)
|
784
|
+
{
|
785
|
+
foreach my $signal (@{ $signals })
|
786
|
+
{
|
787
|
+
push @{ $asg{ $author } }, $signal;
|
788
|
+
push @{ $aas{ $author }->signals }, $signal;
|
789
|
+
}
|
790
|
+
}
|
791
|
+
}
|
792
|
+
|
793
|
+
# Cleanup
|
794
|
+
%ntl_asg = ();
|
795
|
+
# Cleanup
|
796
|
+
@author_str = ();
|
797
|
+
$signal_str = "";
|
798
|
+
@author_rc = ();
|
799
|
+
# Cleanup
|
800
|
+
$prev_class = "";
|
801
|
+
|
802
|
+
# Update the counter
|
803
|
+
$counter++;
|
804
|
+
|
805
|
+
#
|
806
|
+
$is_authors = 0;
|
807
|
+
|
808
|
+
next;
|
809
|
+
}
|
810
|
+
|
811
|
+
# Split the line
|
812
|
+
my @fields = split /\t/, $line;
|
813
|
+
# and extract the class and the content
|
814
|
+
my $class = $fields[ -1 ];
|
815
|
+
my $content = $fields[ 0 ];
|
816
|
+
|
817
|
+
if ($class eq $prev_class)
|
818
|
+
{
|
819
|
+
# An author
|
820
|
+
if ($class eq "author")
|
821
|
+
{
|
822
|
+
push @author_str, $content;
|
823
|
+
push @author_rc, $rc_lines[ $counter ];
|
824
|
+
}
|
825
|
+
# A signal
|
826
|
+
elsif ($class eq "signal")
|
827
|
+
{
|
828
|
+
$signal_str .= $content . " ";
|
829
|
+
}
|
830
|
+
}
|
831
|
+
else
|
832
|
+
{
|
833
|
+
if ($prev_class eq "author")
|
834
|
+
{
|
835
|
+
my ($authors, $rcs) = NormalizeAuthorNames(\@author_str, \@author_rc);
|
836
|
+
# Save each author
|
837
|
+
for (my $i = 0; $i < scalar(@{ $authors }); $i++)
|
838
|
+
{
|
839
|
+
$asg{ $authors->[ $i ] } = ();
|
840
|
+
$aas{ $authors->[ $i ] } = $rcs->[ $i ];
|
841
|
+
$ntl_asg{ $authors->[ $i ] } = 0;
|
842
|
+
}
|
843
|
+
}
|
844
|
+
elsif ($prev_class eq "signal")
|
845
|
+
{
|
846
|
+
my $signals = NormalizeAuthorSignal($signal_str);
|
847
|
+
# Save each signal to its corresponding author
|
848
|
+
foreach my $author (keys %ntl_asg)
|
849
|
+
{
|
850
|
+
foreach my $signal (@{ $signals })
|
851
|
+
{
|
852
|
+
push @{ $asg{ $author } }, $signal;
|
853
|
+
push @{ $aas{ $author }->signals }, $signal;
|
854
|
+
}
|
855
|
+
}
|
856
|
+
}
|
857
|
+
|
858
|
+
# Clean the next to last author list if this current class is author
|
859
|
+
if (($is_authors == 0) && ($class eq "author")) { %ntl_asg = (); $is_authors = 1; }
|
860
|
+
#
|
861
|
+
if ($class eq "signal") { $is_authors = 0; }
|
862
|
+
|
863
|
+
# Cleanup
|
864
|
+
@author_str = ();
|
865
|
+
$signal_str = "";
|
866
|
+
@author_rc = ();
|
867
|
+
# Switch to the current class
|
868
|
+
$prev_class = $class;
|
869
|
+
|
870
|
+
if ($class eq "author")
|
871
|
+
{
|
872
|
+
push @author_str, $content;
|
873
|
+
push @author_rc, $rc_lines[ $counter ];
|
874
|
+
}
|
875
|
+
elsif ($class eq "signal")
|
876
|
+
{
|
877
|
+
$signal_str .= $content . " ";
|
878
|
+
}
|
879
|
+
}
|
880
|
+
|
881
|
+
# Update the counter
|
882
|
+
$counter++;
|
883
|
+
}
|
884
|
+
|
885
|
+
# Final class
|
886
|
+
if ($prev_class eq "author")
|
887
|
+
{
|
888
|
+
my ($authors, $rcs) = NormalizeAuthorNames(\@author_str, \@author_rc);
|
889
|
+
# Save each author
|
890
|
+
for (my $i = 0; $i < scalar(@{ $authors }); $i++)
|
891
|
+
{
|
892
|
+
$asg{ $authors->[ $i ] } = ();
|
893
|
+
$aas{ $authors->[ $i ] } = $rcs->[ $i ];
|
894
|
+
$ntl_asg{ $authors->[ $i ] } = 0;
|
895
|
+
}
|
896
|
+
}
|
897
|
+
elsif ($prev_class eq "signal")
|
898
|
+
{
|
899
|
+
my $signals = NormalizeAuthorSignal($signal_str);
|
900
|
+
# Save each signal to its corresponding author
|
901
|
+
foreach my $author (keys %ntl_asg)
|
902
|
+
{
|
903
|
+
foreach my $signal (@{ $signals })
|
904
|
+
{
|
905
|
+
push @{ $asg{ $author } }, $signal;
|
906
|
+
push @{ $aas{ $author }->signals }, $signal;
|
907
|
+
}
|
908
|
+
}
|
909
|
+
}
|
910
|
+
|
911
|
+
# Done
|
912
|
+
close $input_handle;
|
913
|
+
|
914
|
+
# Clean up
|
915
|
+
unlink $infile;
|
916
|
+
unlink $outfile;
|
917
|
+
# Done
|
918
|
+
return (\%asg, \%aas);
|
919
|
+
}
|
920
|
+
|
921
|
+
sub NormalizeAuthorNames
|
922
|
+
{
|
923
|
+
my ($author_str, $author_rc) = @_;
|
924
|
+
|
925
|
+
# Constraint
|
926
|
+
if (scalar(@{ $author_str }) != scalar(@{ $author_rc })) { print STDERR "# It cannot happen, if you encounter it, please consider report it as a bug", "\n"; die; }
|
927
|
+
|
928
|
+
# Mark the beginning of an author name
|
929
|
+
my $begin = 1;
|
930
|
+
# and its corresponding relational features
|
931
|
+
my $rcbegin = 0;
|
932
|
+
|
933
|
+
my @current = ();
|
934
|
+
my @authors = ();
|
935
|
+
my @rcs = ();
|
936
|
+
# Check all tokens in the author string
|
937
|
+
for (my $i = 0; $i < scalar(@{ $author_str }); $i++)
|
938
|
+
{
|
939
|
+
my $token = $author_str->[ $i ];
|
940
|
+
|
941
|
+
# Mark the end of an author name
|
942
|
+
if ($token =~ m/^(&|and|,|;)$/i)
|
943
|
+
{
|
944
|
+
if (scalar(@current) != 0)
|
945
|
+
{
|
946
|
+
push @authors, ParsCit::PostProcess::NormalizeAuthorName(@current);
|
947
|
+
|
948
|
+
# Save the relational features of an author (its first word)
|
949
|
+
my @fields = split /\s/, $author_rc->[ $rcbegin ];
|
950
|
+
# Create new record
|
951
|
+
my $tmp = aut_rcfeatures->new( signals => [],
|
952
|
+
top => $fields[ 1 ], bottom => $fields[ 2 ], left => $fields[ 3 ], right => $fields[ 4 ],
|
953
|
+
page => $fields[ 5 ], section => $fields[ 6 ], para => $fields[ 7 ], line => $fields[ 8 ] );
|
954
|
+
# Save the record
|
955
|
+
push @rcs, $tmp;
|
956
|
+
}
|
957
|
+
|
958
|
+
# Cleanup
|
959
|
+
@current = ();
|
960
|
+
$begin = 1;
|
961
|
+
|
962
|
+
next;
|
963
|
+
}
|
964
|
+
|
965
|
+
# Mark the begin of an author name
|
966
|
+
if ($begin == 1)
|
967
|
+
{
|
968
|
+
push @current, $token;
|
969
|
+
|
970
|
+
$begin = 0;
|
971
|
+
$rcbegin = $i;
|
972
|
+
|
973
|
+
next;
|
974
|
+
}
|
975
|
+
|
976
|
+
# Author name ending with a comma
|
977
|
+
if ($token =~ m/,$/)
|
978
|
+
{
|
979
|
+
push @current, $token;
|
980
|
+
|
981
|
+
if (scalar(@current) != 0)
|
982
|
+
{
|
983
|
+
push @authors, ParsCit::PostProcess::NormalizeAuthorName(@current);
|
984
|
+
|
985
|
+
# Save the relational features of an author (its first word)
|
986
|
+
my @fields = split /\s/, $author_rc->[ $rcbegin ];
|
987
|
+
# Create new record
|
988
|
+
my $tmp = aut_rcfeatures->new( signals => [],
|
989
|
+
top => $fields[ 1 ], bottom => $fields[ 2 ], left => $fields[ 3 ], right => $fields[ 4 ],
|
990
|
+
page => $fields[ 5 ], section => $fields[ 6 ], para => $fields[ 7 ], line => $fields[ 8 ] );
|
991
|
+
# Save the record
|
992
|
+
push @rcs, $tmp;
|
993
|
+
}
|
994
|
+
|
995
|
+
# Cleanup
|
996
|
+
@current = ();
|
997
|
+
$begin = 1;
|
998
|
+
}
|
999
|
+
# or it's just parts of the name
|
1000
|
+
else
|
1001
|
+
{
|
1002
|
+
push @current, $token;
|
1003
|
+
}
|
1004
|
+
}
|
1005
|
+
|
1006
|
+
# Last author name
|
1007
|
+
if (scalar(@current) != 0)
|
1008
|
+
{
|
1009
|
+
push @authors, ParsCit::PostProcess::NormalizeAuthorName(@current);
|
1010
|
+
|
1011
|
+
# Save the relational features of an author (its first word)
|
1012
|
+
my @fields = split /\s/, $author_rc->[ $rcbegin ];
|
1013
|
+
# Create new record
|
1014
|
+
my $tmp = aut_rcfeatures->new( signals => [],
|
1015
|
+
top => $fields[ 1 ], bottom => $fields[ 2 ], left => $fields[ 3 ], right => $fields[ 4 ],
|
1016
|
+
page => $fields[ 5 ], section => $fields[ 6 ], para => $fields[ 7 ], line => $fields[ 8 ] );
|
1017
|
+
# Save the record
|
1018
|
+
push @rcs, $tmp;
|
1019
|
+
}
|
1020
|
+
|
1021
|
+
# Done
|
1022
|
+
return (\@authors, \@rcs);
|
1023
|
+
}
|
1024
|
+
|
1025
|
+
#
|
1026
|
+
sub NormalizeAuthorSignal
|
1027
|
+
{
|
1028
|
+
my ($signal_str) = @_;
|
1029
|
+
|
1030
|
+
# Trim
|
1031
|
+
$signal_str =~ s/^\s+|\s+$//g;
|
1032
|
+
# Split into individual signal
|
1033
|
+
my @signals = split / |,|:|;/, $signal_str;
|
1034
|
+
|
1035
|
+
# Done
|
1036
|
+
return \@signals;
|
1037
|
+
}
|
1038
|
+
|
1039
|
+
# Extract features from affiliation lines
|
1040
|
+
# The list of features include
|
1041
|
+
# Content
|
1042
|
+
# Content, lower case, no punctuation
|
1043
|
+
# Content length
|
1044
|
+
# First word in line
|
1045
|
+
#
|
1046
|
+
# XML features
|
1047
|
+
# Subscript, superscript
|
1048
|
+
# Bold
|
1049
|
+
# Italic
|
1050
|
+
# Underline
|
1051
|
+
# Relative font size
|
1052
|
+
# Differentiate features
|
1053
|
+
sub AffiliationFeatureExtraction
|
1054
|
+
{
|
1055
|
+
my ($aff_lines, $aff_addrs) = @_;
|
1056
|
+
|
1057
|
+
# NOTE: Relational classifier features
|
1058
|
+
my $rc_features = "";
|
1059
|
+
|
1060
|
+
# Features will be stored here
|
1061
|
+
my $features = "";
|
1062
|
+
# First word in line
|
1063
|
+
my $is_first_line = undef;
|
1064
|
+
|
1065
|
+
# Font size
|
1066
|
+
my %fonts = ();
|
1067
|
+
# Each line contains many runs
|
1068
|
+
foreach my $line (@{ $aff_lines })
|
1069
|
+
{
|
1070
|
+
my $runs = $line->get_objs_ref();
|
1071
|
+
# Iterator though all work in all lines
|
1072
|
+
foreach my $run (@{ $runs })
|
1073
|
+
{
|
1074
|
+
my $fsize = $run->get_font_size();
|
1075
|
+
my $words = $run->get_objs_ref();
|
1076
|
+
|
1077
|
+
# Statistic
|
1078
|
+
if (! exists $fonts{ $fsize })
|
1079
|
+
{
|
1080
|
+
$fonts{ $fsize } = scalar(@{ $words });
|
1081
|
+
}
|
1082
|
+
else
|
1083
|
+
{
|
1084
|
+
$fonts{ $fsize } += scalar(@{ $words });
|
1085
|
+
}
|
1086
|
+
}
|
1087
|
+
}
|
1088
|
+
|
1089
|
+
my $dominate_font = undef;
|
1090
|
+
# Sort all the font descend with the number of their appearance
|
1091
|
+
my @sorted = sort { $fonts{ $b } <=> $fonts{ $a } } keys %fonts;
|
1092
|
+
# Select the dominated font
|
1093
|
+
$dominate_font = $sorted[ 0 ];
|
1094
|
+
|
1095
|
+
my $size_mismatch = undef;
|
1096
|
+
# TODO: serious error if the size of aff_lines and the size of aff_addrs mismatch
|
1097
|
+
if (scalar(@{ $aff_lines }) != scalar(@{ $aff_addrs }))
|
1098
|
+
{
|
1099
|
+
$size_mismatch = 1;
|
1100
|
+
# Print the error but still try to continue
|
1101
|
+
print STDERR "# Total number of affiliation lines (" . scalar(@{ $aff_lines }) . ") != Total number of affiliation addresses (" . scalar(@{ $aff_addrs }) . ")." . "\n";
|
1102
|
+
}
|
1103
|
+
|
1104
|
+
my $prev_page = undef;
|
1105
|
+
my $prev_sect = undef;
|
1106
|
+
my $prev_para = undef;
|
1107
|
+
# Each line contains many runs
|
1108
|
+
for (my $counter = 0; $counter < scalar(@{ $aff_lines }); $counter++)
|
1109
|
+
{
|
1110
|
+
# Get the line object
|
1111
|
+
my $line = $aff_lines->[ $counter ];
|
1112
|
+
|
1113
|
+
# Check the size of aff_lines and aff_addrs
|
1114
|
+
if (! defined $size_mismatch)
|
1115
|
+
{
|
1116
|
+
# Check if two consecutive lines are from two different sections
|
1117
|
+
if (! defined $prev_page)
|
1118
|
+
{
|
1119
|
+
# Init
|
1120
|
+
$prev_page = $aff_addrs->[ $counter ]->{ 'L1' };
|
1121
|
+
$prev_sect = $aff_addrs->[ $counter ]->{ 'L2' };
|
1122
|
+
$prev_para = $aff_addrs->[ $counter ]->{ 'L3' };
|
1123
|
+
}
|
1124
|
+
else
|
1125
|
+
{
|
1126
|
+
# Affiliations from different sections will be separated immediately
|
1127
|
+
if (($prev_page != $aff_addrs->[ $counter ]->{ 'L1' }) ||
|
1128
|
+
($prev_sect != $aff_addrs->[ $counter ]->{ 'L2' }) ||
|
1129
|
+
($prev_para != $aff_addrs->[ $counter ]->{ 'L3' }))
|
1130
|
+
{
|
1131
|
+
$features .= "\n";
|
1132
|
+
|
1133
|
+
# NOTE: Relational classifier features
|
1134
|
+
$rc_features .= "\n";
|
1135
|
+
}
|
1136
|
+
|
1137
|
+
# Save the paragraph index
|
1138
|
+
$prev_page = $aff_addrs->[ $counter ]->{ 'L1' };
|
1139
|
+
$prev_sect = $aff_addrs->[ $counter ]->{ 'L2' };
|
1140
|
+
$prev_para = $aff_addrs->[ $counter ]->{ 'L3' };
|
1141
|
+
}
|
1142
|
+
}
|
1143
|
+
|
1144
|
+
# Set first word in line
|
1145
|
+
$is_first_line = 1;
|
1146
|
+
|
1147
|
+
# Two previous words
|
1148
|
+
my $prev_word = undef;
|
1149
|
+
my $prev_prev_word = undef;
|
1150
|
+
|
1151
|
+
# Format of the previous word
|
1152
|
+
my ($prev_bold, $prev_italic, $prev_underline, $prev_suscript, $prev_fontsize) = "unknown";
|
1153
|
+
|
1154
|
+
my $runs = $line->get_objs_ref();
|
1155
|
+
# Iterator though all work in all lines
|
1156
|
+
foreach my $run (@{ $runs })
|
1157
|
+
{
|
1158
|
+
# The run must be non-empty
|
1159
|
+
my $tmp = $run->get_content();
|
1160
|
+
# Trim
|
1161
|
+
$tmp =~ s/^\s+|\s+$//g;
|
1162
|
+
# Skip blank run
|
1163
|
+
if ($tmp eq "") { next; }
|
1164
|
+
|
1165
|
+
###
|
1166
|
+
# The following features are XML features
|
1167
|
+
###
|
1168
|
+
|
1169
|
+
# Bold format
|
1170
|
+
my $bold = ($run->get_bold() eq "true") ? "bold" : "none";
|
1171
|
+
|
1172
|
+
# Italic format
|
1173
|
+
my $italic = ($run->get_italic() eq "true") ? "italic" : "none";
|
1174
|
+
|
1175
|
+
# Underline
|
1176
|
+
my $underline = ($run->get_underline() eq "true") ? "underline" : "none";
|
1177
|
+
|
1178
|
+
# Sub-Sup-script
|
1179
|
+
my $suscript = ($run->get_suscript() eq "superscript") ? "super" :
|
1180
|
+
($run->get_suscript() eq "subscript") ? "sub" : "none";
|
1181
|
+
|
1182
|
+
# Relative font size
|
1183
|
+
my $fontsize = ($run->get_font_size() > $dominate_font) ? "large" :
|
1184
|
+
($run->get_font_size() < $dominate_font) ? "small" : "normal";
|
1185
|
+
|
1186
|
+
###
|
1187
|
+
# End of XML features
|
1188
|
+
###
|
1189
|
+
|
1190
|
+
# All words in the run
|
1191
|
+
my $words = $run->get_objs_ref();
|
1192
|
+
|
1193
|
+
# For each word
|
1194
|
+
foreach my $word (@{ $words })
|
1195
|
+
{
|
1196
|
+
# Get word location
|
1197
|
+
my $top = $word->get_top_pos();
|
1198
|
+
my $bottom = $word->get_bottom_pos();
|
1199
|
+
my $left = $word->get_left_pos();
|
1200
|
+
my $right = $word->get_right_pos();
|
1201
|
+
|
1202
|
+
# NOTE: heuristic rule, for words in the same line
|
1203
|
+
# If the x-axis distance between this word and the previous word is
|
1204
|
+
# three times larger than the distance between the previous word and
|
1205
|
+
# the word before it, then it marks the separator.
|
1206
|
+
# The better way to do this is to introduce it as a new feature in the
|
1207
|
+
# author and affiliation model but this step requires re-training these
|
1208
|
+
# two models, so ...
|
1209
|
+
#
|
1210
|
+
# NOTE: Assuming left to right writing
|
1211
|
+
if (! defined $prev_word)
|
1212
|
+
{
|
1213
|
+
$prev_word = $word;
|
1214
|
+
}
|
1215
|
+
elsif (! defined $prev_prev_word)
|
1216
|
+
{
|
1217
|
+
# NOTE: Words have the power to both destroy and heal, when words are both
|
1218
|
+
# true and kind, they can change our world
|
1219
|
+
if (($prev_word->get_left_pos() != $word->get_left_pos()) && ($prev_word->get_right_pos() != $word->get_right_pos()))
|
1220
|
+
{
|
1221
|
+
$prev_prev_word = $prev_word;
|
1222
|
+
$prev_word = $word;
|
1223
|
+
}
|
1224
|
+
}
|
1225
|
+
else
|
1226
|
+
{
|
1227
|
+
# NOTE: Words have the power to both destroy and heal, when words are both
|
1228
|
+
# true and kind, they can change our world
|
1229
|
+
if (($prev_word->get_left_pos() != $word->get_left_pos()) && ($prev_word->get_right_pos() != $word->get_right_pos()))
|
1230
|
+
{
|
1231
|
+
my $prev_dist = abs ($prev_word->get_left_pos() - $prev_prev_word->get_right_pos());
|
1232
|
+
my $curr_dist = abs ($word->get_left_pos() - $prev_word->get_right_pos());
|
1233
|
+
|
1234
|
+
if ($prev_dist * 5 < $curr_dist)
|
1235
|
+
{
|
1236
|
+
$features .= "\n";
|
1237
|
+
|
1238
|
+
# NOTE: Relational classifier features
|
1239
|
+
$rc_features .= "\n";
|
1240
|
+
}
|
1241
|
+
|
1242
|
+
$prev_prev_word = $prev_word;
|
1243
|
+
$prev_word = $word;
|
1244
|
+
}
|
1245
|
+
}
|
1246
|
+
|
1247
|
+
# Extract features
|
1248
|
+
my $full_content = $word->get_content();
|
1249
|
+
# Trim
|
1250
|
+
$full_content =~ s/^\s+|\s+$//g;
|
1251
|
+
|
1252
|
+
# Skip blank run
|
1253
|
+
if ($full_content eq "") { next; }
|
1254
|
+
|
1255
|
+
my @sub_content = ();
|
1256
|
+
# This is the tricky part, one word e.g. **affiliation will be
|
1257
|
+
# splitted into two parts: the signal, and the affiliation if
|
1258
|
+
# possible using regular expression
|
1259
|
+
while ($full_content =~ m/([\w|-]*)(\W*)/g)
|
1260
|
+
{
|
1261
|
+
my $first = $1;
|
1262
|
+
my $second = $2;
|
1263
|
+
|
1264
|
+
# Trim
|
1265
|
+
$first =~ s/^\s+|\s+$//g;
|
1266
|
+
$second =~ s/^\s+|\s+$//g;
|
1267
|
+
|
1268
|
+
# Only keep non-blank content
|
1269
|
+
if ($first ne "") { push @sub_content, $first; }
|
1270
|
+
|
1271
|
+
# Check the signal and separator
|
1272
|
+
while ($second =~ m/([,|\.|:|;]*)([^,\.:;]*)/g)
|
1273
|
+
{
|
1274
|
+
my $sub_first = $1;
|
1275
|
+
my $sub_second = $2;
|
1276
|
+
|
1277
|
+
# Trim
|
1278
|
+
$sub_first =~ s/^\s+|\s+$//g;
|
1279
|
+
$sub_second =~ s/^\s+|\s+$//g;
|
1280
|
+
|
1281
|
+
# Only keep non-blank separator
|
1282
|
+
if ($sub_first ne "") { push @sub_content, $sub_first; }
|
1283
|
+
# Only keep non-blank signal
|
1284
|
+
if ($sub_second ne "") { push @sub_content, $sub_second; }
|
1285
|
+
}
|
1286
|
+
}
|
1287
|
+
|
1288
|
+
foreach my $content (@sub_content)
|
1289
|
+
{
|
1290
|
+
# Content
|
1291
|
+
$features .= $content . "\t";
|
1292
|
+
|
1293
|
+
my $content_n = $content;
|
1294
|
+
# Remove punctuation
|
1295
|
+
$content_n =~ s/[^\w]//g;
|
1296
|
+
# Lower case
|
1297
|
+
my $content_l = lc($content);
|
1298
|
+
# Lower case, no punctuation
|
1299
|
+
my $content_nl = lc($content_n);
|
1300
|
+
# Lower case
|
1301
|
+
$features .= $content_l . "\t";
|
1302
|
+
# Lower case, no punctuation
|
1303
|
+
if ($content_nl ne "")
|
1304
|
+
{
|
1305
|
+
$features .= $content_nl . "\t";
|
1306
|
+
}
|
1307
|
+
else
|
1308
|
+
{
|
1309
|
+
$features .= $content_l . "\t";
|
1310
|
+
}
|
1311
|
+
|
1312
|
+
# Split into character
|
1313
|
+
my @chars = split(//, $content);
|
1314
|
+
# Content length
|
1315
|
+
my $length = (scalar(@chars) == 1) ? "1-char" :
|
1316
|
+
(scalar(@chars) == 2) ? "2-char" :
|
1317
|
+
(scalar(@chars) == 3) ? "3-char" : "4+char";
|
1318
|
+
$features .= $length . "\t";
|
1319
|
+
|
1320
|
+
# First word in line
|
1321
|
+
if ($is_first_line == 1)
|
1322
|
+
{
|
1323
|
+
$features .= "begin" . "\t";
|
1324
|
+
|
1325
|
+
# Next words are not the first in line anymore
|
1326
|
+
$is_first_line = 0;
|
1327
|
+
}
|
1328
|
+
else
|
1329
|
+
{
|
1330
|
+
$features .= "continue" . "\t";
|
1331
|
+
}
|
1332
|
+
|
1333
|
+
###
|
1334
|
+
# The following features are XML features
|
1335
|
+
###
|
1336
|
+
|
1337
|
+
# Bold format
|
1338
|
+
$features .= $bold . "\t";
|
1339
|
+
|
1340
|
+
# Italic format
|
1341
|
+
$features .= $italic . "\t";
|
1342
|
+
|
1343
|
+
# Underline
|
1344
|
+
$features .= $underline . "\t";
|
1345
|
+
|
1346
|
+
# Sub-Sup-script
|
1347
|
+
$features .= $suscript . "\t";
|
1348
|
+
|
1349
|
+
# Relative font size
|
1350
|
+
$features .= $fontsize . "\t";
|
1351
|
+
|
1352
|
+
# First word in run
|
1353
|
+
if (($prev_bold ne $bold) || ($prev_italic ne $italic) || ($prev_underline ne $underline) || ($prev_suscript ne $suscript) || ($prev_fontsize ne $fontsize))
|
1354
|
+
{
|
1355
|
+
$features .= "fbegin" . "\t";
|
1356
|
+
}
|
1357
|
+
else
|
1358
|
+
{
|
1359
|
+
$features .= "fcontinue" . "\t";
|
1360
|
+
}
|
1361
|
+
|
1362
|
+
# New token
|
1363
|
+
$features .= "\n";
|
1364
|
+
|
1365
|
+
# Save the XML format
|
1366
|
+
$prev_bold = $bold;
|
1367
|
+
$prev_italic = $italic;
|
1368
|
+
$prev_underline = $underline;
|
1369
|
+
$prev_suscript = $suscript;
|
1370
|
+
$prev_fontsize = $fontsize;
|
1371
|
+
|
1372
|
+
# NOTE: Relational classifier features
|
1373
|
+
# Content
|
1374
|
+
$rc_features .= $content . "\t";
|
1375
|
+
# Location
|
1376
|
+
$rc_features .= $top . "\t";
|
1377
|
+
$rc_features .= $bottom . "\t";
|
1378
|
+
$rc_features .= $left . "\t";
|
1379
|
+
$rc_features .= $right . "\t";
|
1380
|
+
# Index
|
1381
|
+
if (! defined $size_mismatch)
|
1382
|
+
{
|
1383
|
+
$rc_features .= $aff_addrs->[ $counter ]->{ 'L1' } . "\t";
|
1384
|
+
$rc_features .= $aff_addrs->[ $counter ]->{ 'L2' } . "\t";
|
1385
|
+
$rc_features .= $aff_addrs->[ $counter ]->{ 'L3' } . "\t";
|
1386
|
+
$rc_features .= $aff_addrs->[ $counter ]->{ 'L4' } . "\t";
|
1387
|
+
}
|
1388
|
+
# Done
|
1389
|
+
$rc_features .= "\n";
|
1390
|
+
}
|
1391
|
+
}
|
1392
|
+
}
|
1393
|
+
}
|
1394
|
+
|
1395
|
+
return ($features, $rc_features);
|
1396
|
+
|
1397
|
+
}
|
1398
|
+
|
1399
|
+
# Extract features from author lines
|
1400
|
+
# The list of features include
|
1401
|
+
# Content
|
1402
|
+
# Content, lower case, no punctuation
|
1403
|
+
# Content length
|
1404
|
+
# Capitalization
|
1405
|
+
# Numeric property
|
1406
|
+
# Last punctuation
|
1407
|
+
# First 4-gram
|
1408
|
+
# Last 4-gram
|
1409
|
+
# Dictionary
|
1410
|
+
# First word in line
|
1411
|
+
#
|
1412
|
+
# XML features
|
1413
|
+
# Subscript, superscript
|
1414
|
+
# Bold
|
1415
|
+
# Italic
|
1416
|
+
# Underline
|
1417
|
+
# Relative font size
|
1418
|
+
# Differentiate features
|
1419
|
+
sub AuthorFeatureExtraction
|
1420
|
+
{
|
1421
|
+
my ($aut_lines, $aut_addrs) = @_;
|
1422
|
+
|
1423
|
+
# NOTE: Relational classifier features
|
1424
|
+
my $rc_features = "";
|
1425
|
+
|
1426
|
+
# Features will be stored here
|
1427
|
+
my $features = "";
|
1428
|
+
# First word in line
|
1429
|
+
my $is_first_line = undef;
|
1430
|
+
# First word in run
|
1431
|
+
# my $is_first_run = undef;
|
1432
|
+
|
1433
|
+
# Font size
|
1434
|
+
my %fonts = ();
|
1435
|
+
# Each line contains many runs
|
1436
|
+
foreach my $line (@{ $aut_lines })
|
1437
|
+
{
|
1438
|
+
my $runs = $line->get_objs_ref();
|
1439
|
+
# Iterator though all work in all lines
|
1440
|
+
foreach my $run (@{ $runs })
|
1441
|
+
{
|
1442
|
+
my $fsize = $run->get_font_size();
|
1443
|
+
my $words = $run->get_objs_ref();
|
1444
|
+
|
1445
|
+
# Statistic
|
1446
|
+
if (! exists $fonts{ $fsize })
|
1447
|
+
{
|
1448
|
+
$fonts{ $fsize } = scalar(@{ $words });
|
1449
|
+
}
|
1450
|
+
else
|
1451
|
+
{
|
1452
|
+
$fonts{ $fsize } += scalar(@{ $words });
|
1453
|
+
}
|
1454
|
+
}
|
1455
|
+
}
|
1456
|
+
|
1457
|
+
my $dominate_font = undef;
|
1458
|
+
# Sort all the font descend with the number of their appearance
|
1459
|
+
my @sorted = sort { $fonts{ $b } <=> $fonts{ $a } } keys %fonts;
|
1460
|
+
# Select the dominated font
|
1461
|
+
$dominate_font = $sorted[ 0 ];
|
1462
|
+
|
1463
|
+
my $size_mismatch = undef;
|
1464
|
+
# TODO: serious error if the size of aut_lines and the size of aut_addrs mismatch
|
1465
|
+
if (scalar(@{ $aut_lines }) != scalar(@{ $aut_addrs }))
|
1466
|
+
{
|
1467
|
+
$size_mismatch = 1;
|
1468
|
+
# Print the error but still try to continue
|
1469
|
+
print STDERR "# Total number of author lines (" . scalar(@{ $aut_lines }) . ") != Total number of author addresses (" . scalar(@{ $aut_addrs }) . ")." . "\n";
|
1470
|
+
}
|
1471
|
+
|
1472
|
+
my $prev_page = undef;
|
1473
|
+
my $prev_sect = undef;
|
1474
|
+
my $prev_para = undef;
|
1475
|
+
# Each line contains many runs
|
1476
|
+
for (my $counter = 0; $counter < scalar(@{ $aut_lines }); $counter++)
|
1477
|
+
{
|
1478
|
+
# Get the line object
|
1479
|
+
my $line = $aut_lines->[ $counter ];
|
1480
|
+
|
1481
|
+
# Check the size of aut_line and aut_addrs
|
1482
|
+
if (! defined $size_mismatch)
|
1483
|
+
{
|
1484
|
+
# Check if two consecutive lines are from two different sections
|
1485
|
+
if (! defined $prev_page)
|
1486
|
+
{
|
1487
|
+
# Init
|
1488
|
+
$prev_page = $aut_addrs->[ $counter ]->{ 'L1' };
|
1489
|
+
$prev_sect = $aut_addrs->[ $counter ]->{ 'L2' };
|
1490
|
+
$prev_para = $aut_addrs->[ $counter ]->{ 'L3' };
|
1491
|
+
}
|
1492
|
+
else
|
1493
|
+
{
|
1494
|
+
# Authors from different sections will be separated immediately
|
1495
|
+
if (($prev_page != $aut_addrs->[ $counter ]->{ 'L1' }) ||
|
1496
|
+
($prev_sect != $aut_addrs->[ $counter ]->{ 'L2' }) ||
|
1497
|
+
($prev_para != $aut_addrs->[ $counter ]->{ 'L3' }))
|
1498
|
+
{
|
1499
|
+
$features .= "\n";
|
1500
|
+
|
1501
|
+
# NOTE: Relational classifier features
|
1502
|
+
$rc_features .= "\n";
|
1503
|
+
}
|
1504
|
+
|
1505
|
+
# Save the paragraph index
|
1506
|
+
$prev_page = $aut_addrs->[ $counter ]->{ 'L1' };
|
1507
|
+
$prev_sect = $aut_addrs->[ $counter ]->{ 'L2' };
|
1508
|
+
$prev_para = $aut_addrs->[ $counter ]->{ 'L3' };
|
1509
|
+
}
|
1510
|
+
}
|
1511
|
+
|
1512
|
+
# Set first word in line
|
1513
|
+
$is_first_line = 1;
|
1514
|
+
|
1515
|
+
# Previous word and the word before this
|
1516
|
+
my $prev_prev_word = undef;
|
1517
|
+
my $prev_word = undef;
|
1518
|
+
|
1519
|
+
# Format of the previous word
|
1520
|
+
my ($prev_bold, $prev_italic, $prev_underline, $prev_suscript, $prev_fontsize) = "unknown";
|
1521
|
+
|
1522
|
+
my $runs = $line->get_objs_ref();
|
1523
|
+
# Iterator though all work in all lines
|
1524
|
+
foreach my $run (@{ $runs })
|
1525
|
+
{
|
1526
|
+
# The run must be non-empty
|
1527
|
+
my $tmp = $run->get_content();
|
1528
|
+
# Trim
|
1529
|
+
$tmp =~ s/^\s+|\s+$//g;
|
1530
|
+
# Skip blank run
|
1531
|
+
if ($tmp eq "") { next; }
|
1532
|
+
|
1533
|
+
# Set first word in run
|
1534
|
+
# $is_first_run = 1;
|
1535
|
+
|
1536
|
+
###
|
1537
|
+
# The following features are XML features
|
1538
|
+
###
|
1539
|
+
|
1540
|
+
# Bold format
|
1541
|
+
my $bold = ($run->get_bold() eq "true") ? "bold" : "none";
|
1542
|
+
|
1543
|
+
# Italic format
|
1544
|
+
my $italic = ($run->get_italic() eq "true") ? "italic" : "none";
|
1545
|
+
|
1546
|
+
# Underline
|
1547
|
+
my $underline = ($run->get_underline() eq "true") ? "underline" : "none";
|
1548
|
+
|
1549
|
+
# Sub-Sup-script
|
1550
|
+
my $suscript = ($run->get_suscript() eq "superscript") ? "super" :
|
1551
|
+
($run->get_suscript() eq "subscript") ? "sub" : "none";
|
1552
|
+
|
1553
|
+
# Relative font size
|
1554
|
+
my $fontsize = ($run->get_font_size() > $dominate_font) ? "large" :
|
1555
|
+
($run->get_font_size() < $dominate_font) ? "small" : "normal";
|
1556
|
+
|
1557
|
+
###
|
1558
|
+
# End of XML features
|
1559
|
+
###
|
1560
|
+
|
1561
|
+
# All words in the run
|
1562
|
+
my $words = $run->get_objs_ref();
|
1563
|
+
|
1564
|
+
# For each word
|
1565
|
+
foreach my $word (@{ $words })
|
1566
|
+
{
|
1567
|
+
# Get word location
|
1568
|
+
my $top = $word->get_top_pos();
|
1569
|
+
my $bottom = $word->get_bottom_pos();
|
1570
|
+
my $left = $word->get_left_pos();
|
1571
|
+
my $right = $word->get_right_pos();
|
1572
|
+
|
1573
|
+
# NOTE: heuristic rule, for words in the same line
|
1574
|
+
# If the x-axis distance between this word and the previous word is
|
1575
|
+
# three times larger than the distance between the previous word and
|
1576
|
+
# the word before it, then it marks the separator.
|
1577
|
+
# The better way to do this is to introduce it as a new feature in the
|
1578
|
+
# author and affiliation model but this step requires re-training these
|
1579
|
+
# two models, so ...
|
1580
|
+
#
|
1581
|
+
# NOTE: Assuming left to right writing
|
1582
|
+
if (! defined $prev_word)
|
1583
|
+
{
|
1584
|
+
$prev_word = $word;
|
1585
|
+
}
|
1586
|
+
elsif (! defined $prev_prev_word)
|
1587
|
+
{
|
1588
|
+
# NOTE: Words have the power to both destroy and heal, when words are both
|
1589
|
+
# true and kind, they can change our world
|
1590
|
+
if (($prev_word->get_left_pos() != $word->get_left_pos()) && ($prev_word->get_right_pos() != $word->get_right_pos()))
|
1591
|
+
{
|
1592
|
+
$prev_prev_word = $prev_word;
|
1593
|
+
$prev_word = $word;
|
1594
|
+
}
|
1595
|
+
}
|
1596
|
+
else
|
1597
|
+
{
|
1598
|
+
# NOTE: Words have the power to both destroy and heal, when words are both
|
1599
|
+
# true and kind, they can change our world
|
1600
|
+
if (($prev_word->get_left_pos() != $word->get_left_pos()) && ($prev_word->get_right_pos() != $word->get_right_pos()))
|
1601
|
+
{
|
1602
|
+
|
1603
|
+
my $prev_dist = abs ($prev_word->get_left_pos() - $prev_prev_word->get_right_pos());
|
1604
|
+
my $curr_dist = abs ($word->get_left_pos() - $prev_word->get_right_pos());
|
1605
|
+
|
1606
|
+
if ($prev_dist * 5 < $curr_dist)
|
1607
|
+
{
|
1608
|
+
$features .= "\n";
|
1609
|
+
|
1610
|
+
# NOTE: Relational classifier features
|
1611
|
+
$rc_features .= "\n";
|
1612
|
+
}
|
1613
|
+
|
1614
|
+
$prev_prev_word = $prev_word;
|
1615
|
+
$prev_word = $word;
|
1616
|
+
}
|
1617
|
+
}
|
1618
|
+
|
1619
|
+
# Extract features
|
1620
|
+
my $full_content = $word->get_content();
|
1621
|
+
# Trim
|
1622
|
+
$full_content =~ s/^\s+|\s+$//g;
|
1623
|
+
|
1624
|
+
# Skip blank run
|
1625
|
+
if ($full_content eq "") { next; }
|
1626
|
+
|
1627
|
+
my @sub_content = ();
|
1628
|
+
# This is the tricky part, one word e.g. name** will be splitted
|
1629
|
+
# into several parts: the name, the signal, and the separator if
|
1630
|
+
# possible using regular expression
|
1631
|
+
while ($full_content =~ m/([\w|-]*)(\W*)/g)
|
1632
|
+
{
|
1633
|
+
my $first = $1;
|
1634
|
+
my $second = $2;
|
1635
|
+
|
1636
|
+
# Trim
|
1637
|
+
$first =~ s/^\s+|\s+$//g;
|
1638
|
+
$second =~ s/^\s+|\s+$//g;
|
1639
|
+
|
1640
|
+
# Only keep non-blank content
|
1641
|
+
if ($first ne "") { push @sub_content, $first; }
|
1642
|
+
|
1643
|
+
# Check the signal and separator
|
1644
|
+
while ($second =~ m/([,|\.|:|;]*)([^,\.:;]*)/g)
|
1645
|
+
{
|
1646
|
+
my $sub_first = $1;
|
1647
|
+
my $sub_second = $2;
|
1648
|
+
|
1649
|
+
# Trim
|
1650
|
+
$sub_first =~ s/^\s+|\s+$//g;
|
1651
|
+
$sub_second =~ s/^\s+|\s+$//g;
|
1652
|
+
|
1653
|
+
# Only keep non-blank separator
|
1654
|
+
if ($sub_first ne "") { push @sub_content, $sub_first; }
|
1655
|
+
# Only keep non-blank signal
|
1656
|
+
if ($sub_second ne "") { push @sub_content, $sub_second; }
|
1657
|
+
}
|
1658
|
+
}
|
1659
|
+
|
1660
|
+
foreach my $content (@sub_content)
|
1661
|
+
{
|
1662
|
+
# Content
|
1663
|
+
$features .= $content . "\t";
|
1664
|
+
|
1665
|
+
my $content_n = $content;
|
1666
|
+
# Remove punctuation
|
1667
|
+
$content_n =~ s/[^\w]//g;
|
1668
|
+
# Lower case
|
1669
|
+
my $content_l = lc($content);
|
1670
|
+
# Lower case, no punctuation
|
1671
|
+
my $content_nl = lc($content_n);
|
1672
|
+
# Lower case
|
1673
|
+
$features .= $content_l . "\t";
|
1674
|
+
# Lower case, no punctuation
|
1675
|
+
if ($content_nl ne "")
|
1676
|
+
{
|
1677
|
+
$features .= $content_nl . "\t";
|
1678
|
+
}
|
1679
|
+
else
|
1680
|
+
{
|
1681
|
+
$features .= $content_l . "\t";
|
1682
|
+
}
|
1683
|
+
|
1684
|
+
# Capitalization
|
1685
|
+
my $ortho = ($content =~ /^[\p{IsUpper}]$/) ? "single" :
|
1686
|
+
($content =~ /^[\p{IsUpper}][\p{IsLower}]+/) ? "init" :
|
1687
|
+
($content =~ /^[\p{IsUpper}]+$/) ? "all" : "others";
|
1688
|
+
$features .= $ortho . "\t";
|
1689
|
+
|
1690
|
+
# Numeric property
|
1691
|
+
my $num = ($content =~ /^[0-9]$/) ? "1dig" :
|
1692
|
+
($content =~ /^[0-9][0-9]$/) ? "2dig" :
|
1693
|
+
($content =~ /^[0-9][0-9][0-9]$/) ? "3dig" :
|
1694
|
+
($content =~ /^[0-9]+$/) ? "4+dig" :
|
1695
|
+
($content =~ /^[0-9]+(th|st|nd|rd)$/) ? "ordinal" :
|
1696
|
+
($content =~ /[0-9]/) ? "hasdig" : "nonnum";
|
1697
|
+
$features .= $num . "\t";
|
1698
|
+
|
1699
|
+
# Last punctuation
|
1700
|
+
my $punct = ($content =~ /^[\"\'\`]/) ? "leadq" :
|
1701
|
+
($content =~ /[\"\'\`][^s]?$/) ? "endq" :
|
1702
|
+
($content =~ /\-.*\-/) ? "multi" :
|
1703
|
+
($content =~ /[\-\,\:\;]$/) ? "cont" :
|
1704
|
+
($content =~ /[\!\?\.\"\']$/) ? "stop" :
|
1705
|
+
($content =~ /^[\(\[\{\<].+[\)\]\}\>].?$/) ? "braces" : "others";
|
1706
|
+
$features .= $punct . "\t";
|
1707
|
+
|
1708
|
+
# Split into character
|
1709
|
+
my @chars = split(//, $content);
|
1710
|
+
my $clen = scalar @chars;
|
1711
|
+
# Content length
|
1712
|
+
my $length = (scalar(@chars) == 1) ? "1-char" :
|
1713
|
+
(scalar(@chars) == 2) ? "2-char" :
|
1714
|
+
(scalar(@chars) == 3) ? "3-char" : "4+char";
|
1715
|
+
$features .= $length . "\t";
|
1716
|
+
# First n-gram
|
1717
|
+
$features .= $chars[ 0 ] . "\t";
|
1718
|
+
if ($clen >= 2) {
|
1719
|
+
$features .= join("", @chars[ 0..1 ]) . "\t";
|
1720
|
+
} else {
|
1721
|
+
$features .= $length . "\t";
|
1722
|
+
}
|
1723
|
+
if ($clen >= 3) {
|
1724
|
+
$features .= join("", @chars[ 0..2 ]) . "\t";
|
1725
|
+
} elsif ($clen >= 2) {
|
1726
|
+
$features .= join("", @chars[ 0..1 ]) . "\t";
|
1727
|
+
} else {
|
1728
|
+
$features .= $length . "\t";
|
1729
|
+
}
|
1730
|
+
if ($clen >= 4) {
|
1731
|
+
$features .= join("", @chars[ 0..3 ]) . "\t";
|
1732
|
+
} elsif ($clen >= 3) {
|
1733
|
+
$features .= join("", @chars[ 0..2 ]) . "\t";
|
1734
|
+
} elsif ($clen >= 2) {
|
1735
|
+
$features .= join("", @chars[ 0..1 ]) . "\t";
|
1736
|
+
} else {
|
1737
|
+
$features .= $length . "\t";
|
1738
|
+
}
|
1739
|
+
# Last n-gram
|
1740
|
+
$features .= $chars[ -1 ] . "\t";
|
1741
|
+
if ($clen >= 2) {
|
1742
|
+
$features .= join("", @chars[ -2..-1 ]) . "\t";
|
1743
|
+
} else {
|
1744
|
+
$features .= $chars[ -1 ] . "\t";
|
1745
|
+
}
|
1746
|
+
if ($clen >= 3) {
|
1747
|
+
$features .= join("", @chars[ -3..-1 ]) . "\t";
|
1748
|
+
} elsif ($clen >= 2) {
|
1749
|
+
$features .= join("", @chars[ -2..-1 ]) . "\t";
|
1750
|
+
} else {
|
1751
|
+
$features .= $chars[ -1 ] . "\t";
|
1752
|
+
}
|
1753
|
+
if ($clen >= 4) {
|
1754
|
+
$features .= join("", @chars[ -4..-1 ]) . "\t";
|
1755
|
+
} elsif ($clen >= 3) {
|
1756
|
+
$features .= join("", @chars[ -3..-1 ]) . "\t";
|
1757
|
+
} elsif ($clen >= 2) {
|
1758
|
+
$features .= join("", @chars[ -2..-1 ]) . "\t";
|
1759
|
+
} else {
|
1760
|
+
$features .= $chars[ -1 ] . "\t";
|
1761
|
+
}
|
1762
|
+
|
1763
|
+
# Dictionary
|
1764
|
+
my $dict_status = (defined $dict{ $content_nl }) ? $dict{ $content_nl } : 0;
|
1765
|
+
# Possible names
|
1766
|
+
my ($publisher_name, $place_name, $month_name, $last_name, $female_name, $male_name) = undef;
|
1767
|
+
# Check all case
|
1768
|
+
if ($dict_status >= 32) { $dict_status -= 32; $publisher_name = "publisher" } else { $publisher_name = "no"; }
|
1769
|
+
if ($dict_status >= 16) { $dict_status -= 16; $place_name = "place" } else { $place_name = "no"; }
|
1770
|
+
if ($dict_status >= 8) { $dict_status -= 8; $month_name = "month" } else { $month_name = "no"; }
|
1771
|
+
if ($dict_status >= 4) { $dict_status -= 4; $last_name = "last" } else { $last_name = "no"; }
|
1772
|
+
if ($dict_status >= 2) { $dict_status -= 2; $female_name = "female" } else { $female_name = "no"; }
|
1773
|
+
if ($dict_status >= 1) { $dict_status -= 1; $male_name = "male" } else { $male_name = "no"; }
|
1774
|
+
# Save the feature
|
1775
|
+
$features .= $male_name . "\t";
|
1776
|
+
$features .= $female_name . "\t";
|
1777
|
+
$features .= $last_name . "\t";
|
1778
|
+
$features .= $month_name . "\t";
|
1779
|
+
$features .= $place_name . "\t";
|
1780
|
+
$features .= $publisher_name . "\t";
|
1781
|
+
|
1782
|
+
# First word in line
|
1783
|
+
if ($is_first_line == 1)
|
1784
|
+
{
|
1785
|
+
$features .= "begin" . "\t";
|
1786
|
+
|
1787
|
+
# Next words are not the first in line anymore
|
1788
|
+
$is_first_line = 0;
|
1789
|
+
}
|
1790
|
+
else
|
1791
|
+
{
|
1792
|
+
$features .= "continue" . "\t";
|
1793
|
+
}
|
1794
|
+
|
1795
|
+
###
|
1796
|
+
# The following features are XML features
|
1797
|
+
###
|
1798
|
+
|
1799
|
+
# Bold format
|
1800
|
+
$features .= $bold . "\t";
|
1801
|
+
|
1802
|
+
# Italic format
|
1803
|
+
$features .= $italic . "\t";
|
1804
|
+
|
1805
|
+
# Underline
|
1806
|
+
$features .= $underline . "\t";
|
1807
|
+
|
1808
|
+
# Sub-Sup-script
|
1809
|
+
$features .= $suscript . "\t";
|
1810
|
+
|
1811
|
+
# Relative font size
|
1812
|
+
$features .= $fontsize . "\t";
|
1813
|
+
|
1814
|
+
# First word in run
|
1815
|
+
if (($prev_bold ne $bold) || ($prev_italic ne $italic) || ($prev_underline ne $underline) || ($prev_suscript ne $suscript) || ($prev_fontsize ne $fontsize))
|
1816
|
+
{
|
1817
|
+
$features .= "fbegin" . "\t";
|
1818
|
+
|
1819
|
+
# Next words are not the first in line anymore
|
1820
|
+
# $is_first_run = 0;
|
1821
|
+
}
|
1822
|
+
else
|
1823
|
+
{
|
1824
|
+
$features .= "fcontinue" . "\t";
|
1825
|
+
}
|
1826
|
+
|
1827
|
+
# New token
|
1828
|
+
$features .= "\n";
|
1829
|
+
|
1830
|
+
# Save the XML format
|
1831
|
+
$prev_bold = $bold;
|
1832
|
+
$prev_italic = $italic;
|
1833
|
+
$prev_underline = $underline;
|
1834
|
+
$prev_suscript = $suscript;
|
1835
|
+
$prev_fontsize = $fontsize;
|
1836
|
+
|
1837
|
+
# NOTE: Relational classifier features
|
1838
|
+
# Content
|
1839
|
+
$rc_features .= $content . "\t";
|
1840
|
+
# Location
|
1841
|
+
$rc_features .= $top . "\t";
|
1842
|
+
$rc_features .= $bottom . "\t";
|
1843
|
+
$rc_features .= $left . "\t";
|
1844
|
+
$rc_features .= $right . "\t";
|
1845
|
+
# Index
|
1846
|
+
if (! defined $size_mismatch)
|
1847
|
+
{
|
1848
|
+
$rc_features .= $aut_addrs->[ $counter ]->{ 'L1' } . "\t";
|
1849
|
+
$rc_features .= $aut_addrs->[ $counter ]->{ 'L2' } . "\t";
|
1850
|
+
$rc_features .= $aut_addrs->[ $counter ]->{ 'L3' } . "\t";
|
1851
|
+
$rc_features .= $aut_addrs->[ $counter ]->{ 'L4' } . "\t";
|
1852
|
+
}
|
1853
|
+
# Done
|
1854
|
+
$rc_features .= "\n";
|
1855
|
+
}
|
1856
|
+
}
|
1857
|
+
}
|
1858
|
+
}
|
1859
|
+
|
1860
|
+
return ($features, $rc_features);
|
1861
|
+
}
|
1862
|
+
|
1863
|
+
sub ReadDict
|
1864
|
+
{
|
1865
|
+
my ($dictfile) = @_;
|
1866
|
+
|
1867
|
+
# Absolute path
|
1868
|
+
my $dictfile_abs = File::Spec->rel2abs($dictfile);
|
1869
|
+
# Dictionary handle
|
1870
|
+
my $dict_handle = undef;
|
1871
|
+
open ($dict_handle, "<:utf8", $dictfile_abs) || die "Could not open dict file $dictfile_abs: $!";
|
1872
|
+
|
1873
|
+
my $mode = 0;
|
1874
|
+
while (<$dict_handle>)
|
1875
|
+
{
|
1876
|
+
if (/^\#\# Male/) { $mode = 1; } # male names
|
1877
|
+
elsif (/^\#\# Female/) { $mode = 2; } # female names
|
1878
|
+
elsif (/^\#\# Last/) { $mode = 4; } # last names
|
1879
|
+
elsif (/^\#\# Chinese/) { $mode = 4; } # last names
|
1880
|
+
elsif (/^\#\# Months/) { $mode = 8; } # month names
|
1881
|
+
elsif (/^\#\# Place/) { $mode = 16; } # place names
|
1882
|
+
elsif (/^\#\# Publisher/) { $mode = 32; } # publisher names
|
1883
|
+
elsif (/^\#/) { next; }
|
1884
|
+
else
|
1885
|
+
{
|
1886
|
+
chop;
|
1887
|
+
my $key = $_;
|
1888
|
+
my $val = 0;
|
1889
|
+
# Has probability
|
1890
|
+
if (/\t/) { ($key,$val) = split (/\t/,$_); }
|
1891
|
+
|
1892
|
+
# Already tagged (some entries may appear in same part of lexicon more than once
|
1893
|
+
if (! exists $dict{ $key })
|
1894
|
+
{
|
1895
|
+
$dict{ $key } = $mode;
|
1896
|
+
}
|
1897
|
+
else
|
1898
|
+
{
|
1899
|
+
if ($dict{ $key } >= $mode)
|
1900
|
+
{
|
1901
|
+
next;
|
1902
|
+
}
|
1903
|
+
# Not yet tagged
|
1904
|
+
else
|
1905
|
+
{
|
1906
|
+
$dict{ $key } += $mode;
|
1907
|
+
}
|
1908
|
+
}
|
1909
|
+
}
|
1910
|
+
}
|
1911
|
+
|
1912
|
+
close ($dict_handle);
|
1913
|
+
}
|
1914
|
+
|
1915
|
+
sub BuildTmpFile
|
1916
|
+
{
|
1917
|
+
my ($filename) = @_;
|
1918
|
+
|
1919
|
+
my $tmpfile = $filename;
|
1920
|
+
$tmpfile =~ s/[\.\/]//g;
|
1921
|
+
$tmpfile .= $$ . time;
|
1922
|
+
|
1923
|
+
# Untaint tmpfile variable
|
1924
|
+
if ($tmpfile =~ /^([-\@\w.]+)$/)
|
1925
|
+
{
|
1926
|
+
$tmpfile = $1;
|
1927
|
+
}
|
1928
|
+
|
1929
|
+
return "/tmp/$tmpfile"; # Altered by Min (Thu Feb 28 13:08:59 SGT 2008)
|
1930
|
+
}
|
1931
|
+
|
1932
|
+
1;
|
1933
|
+
|
1934
|
+
|
1935
|
+
|
1936
|
+
|
1937
|
+
|
1938
|
+
|
1939
|
+
|
1940
|
+
|
1941
|
+
|
1942
|
+
|
1943
|
+
|
1944
|
+
|
1945
|
+
|
1946
|
+
|
1947
|
+
|
1948
|
+
|
1949
|
+
|