biblicit 2.0.5 → 2.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1949 @@
1
+ package SectLabel::AAMatching;
2
+
3
+ ###
4
+ # This package provides methods to solve the matching problem
5
+ # between author and affiliation in a pdf
6
+ #
7
+ # Do Hoang Nhat Huy 21 Apr, 11
8
+ ###
9
+
10
+ use strict;
11
+
12
+ # Dependencies
13
+ use POSIX;
14
+ use IO::File;
15
+ use XML::Writer;
16
+ use XML::Writer::String;
17
+
18
+ use Class::Struct;
19
+
20
+ # Local libraries
21
+ use SectLabel::Config;
22
+ use ParsCit::PostProcess;
23
+
24
+ # Dictionary
25
+ my %dict = ();
26
+ # CRF++
27
+ my $crft = $ENV{'CRFPP_HOME'} ? "$ENV{'CRFPP_HOME'}/bin/crf_test" : "$FindBin::Bin/../$SectLabel::Config::crf_test";
28
+
29
+ # Matching features of each author, including
30
+ # Signals
31
+ # Coordinations: top, bottom, left, right
32
+ # Position: page, sections, paragraph, line
33
+ struct aut_rcfeatures =>
34
+ {
35
+ signals => '@',
36
+
37
+ top => '$',
38
+ bottom => '$',
39
+ left => '$',
40
+ right => '$',
41
+
42
+ page => '$',
43
+ section => '$',
44
+ para => '$',
45
+ line => '$'
46
+ };
47
+
48
+ # Matching features of each affiliation, including
49
+ # Signals
50
+ # Coordinations: top, bottom, left, right
51
+ # Position: page, sections, paragraph, line
52
+ struct aff_rcfeatures =>
53
+ {
54
+ signals => '@',
55
+
56
+ top => '$',
57
+ bottom => '$',
58
+ left => '$',
59
+ right => '$',
60
+
61
+ page => '$',
62
+ section => '$',
63
+ para => '$',
64
+ line => '$'
65
+ };
66
+
67
+ # Author
68
+ # Affiliation
69
+ sub AAMatching
70
+ {
71
+ my ($doc, $aut_addrs, $aff_addrs) = @_;
72
+
73
+ my $need_object = 1;
74
+ # Get the author objects
75
+ my $aut_lines = Omni::Traversal::OmniCollector($doc, $aut_addrs, $need_object);
76
+ # Get the affiliation objects
77
+ my $aff_lines = Omni::Traversal::OmniCollector($doc, $aff_addrs, $need_object);
78
+
79
+ # Dictionary
80
+ ReadDict($FindBin::Bin . "/../" . $SectLabel::Config::dictFile);
81
+
82
+ # Authors
83
+ my ($aut_features, $aut_rc_features) = AuthorFeatureExtraction($aut_lines, $aut_addrs);
84
+ # Call CRF
85
+ my ($aut_signal, $aut_rc) = AuthorExtraction($aut_features, $aut_rc_features);
86
+
87
+ # Affiliations
88
+ my ($aff_features, $aff_rc_features) = AffiliationFeatureExtraction($aff_lines, $aff_addrs);
89
+ # Call CRF
90
+ my ($aff_signal, $aff_rc, $affs) = AffiliationExtraction($aff_features, $aff_rc_features);
91
+
92
+ # Matching features
93
+ my $aa_features = AAFeatureExtraction($aut_rc, $aff_rc);
94
+ # Matching
95
+ my $aa = AAMatchingImp($aa_features);
96
+
97
+ =pod
98
+ # DEBUG
99
+ my $aut_handle = undef;
100
+ my $aff_handle = undef;
101
+ my $aau_handle = undef;
102
+ my $aaf_handle = undef;
103
+ my $aut_debug = undef;
104
+ my $aff_debug = undef;
105
+ my $aa_handle = undef;
106
+
107
+ open $aut_handle, ">:utf8", "aut.features";
108
+ open $aff_handle, ">:utf8", "aff.features";
109
+ open $aau_handle, ">:utf8", "aau.features";
110
+ open $aaf_handle, ">:utf8", "aaf.features";
111
+ open $aut_debug, ">:utf8", "aut.debug.features";
112
+ open $aff_debug, ">:utf8", "aff.debug.features";
113
+ open $aa_handle, ">:utf8", "aa.features";
114
+
115
+ print $aut_handle $aut_features;
116
+ print $aff_handle $aff_features;
117
+ print $aau_handle $aut_rc_features;
118
+ print $aaf_handle $aff_rc_features;
119
+ print $aa_handle $aa_features, "\n";
120
+
121
+ foreach my $author (keys %{ $aut_rc } )
122
+ {
123
+ print $aut_debug $author, ": ", "\n";
124
+
125
+ foreach my $signal (@{ $aut_rc->{ $author }->signals })
126
+ {
127
+ print $aut_debug "\t", $signal, "\n";
128
+ }
129
+
130
+ print $aut_debug "\t", $aut_rc->{ $author }->top, "\n";
131
+ print $aut_debug "\t", $aut_rc->{ $author }->bottom, "\n";
132
+ print $aut_debug "\t", $aut_rc->{ $author }->left, "\n";
133
+ print $aut_debug "\t", $aut_rc->{ $author }->right, "\n";
134
+
135
+ print $aut_debug "\t", $aut_rc->{ $author }->page, "\n";
136
+ print $aut_debug "\t", $aut_rc->{ $author }->section, "\n";
137
+ print $aut_debug "\t", $aut_rc->{ $author }->para, "\n";
138
+ print $aut_debug "\t", $aut_rc->{ $author }->line, "\n";
139
+ }
140
+
141
+ foreach my $affiliation (keys %{ $aff_rc } )
142
+ {
143
+ print $aff_debug $affiliation, ": ", "\n";
144
+
145
+ foreach my $signal (@{ $aff_rc->{ $affiliation }->signals })
146
+ {
147
+ print $aff_debug "\t", $signal, "\n";
148
+ }
149
+
150
+ print $aff_debug "\t", $aff_rc->{ $affiliation }->top, "\n";
151
+ print $aff_debug "\t", $aff_rc->{ $affiliation }->bottom, "\n";
152
+ print $aff_debug "\t", $aff_rc->{ $affiliation }->left, "\n";
153
+ print $aff_debug "\t", $aff_rc->{ $affiliation }->right, "\n";
154
+
155
+ print $aff_debug "\t", $aff_rc->{ $affiliation }->page, "\n";
156
+ print $aff_debug "\t", $aff_rc->{ $affiliation }->section, "\n";
157
+ print $aff_debug "\t", $aff_rc->{ $affiliation }->para, "\n";
158
+ print $aff_debug "\t", $aff_rc->{ $affiliation }->line, "\n";
159
+ }
160
+
161
+ close $aut_handle;
162
+ close $aff_handle;
163
+ close $aau_handle;
164
+ close $aaf_handle;
165
+ close $aut_debug;
166
+ close $aff_debug;
167
+ close $aa_handle;
168
+ # END
169
+ =cut
170
+
171
+ # Do the matching
172
+ # XML string
173
+ my $sxml = "";
174
+ # and XML writer
175
+ my $writer = new XML::Writer(OUTPUT => \$sxml, ENCODING => 'utf-8', DATA_MODE => 'true', DATA_INDENT => 2);
176
+
177
+ # Algorithm
178
+ $writer->startTag("algorithm", "name" => "AAMatching", "version" => $SectLabel::Config::algorithmVersion);
179
+
180
+ # XML header
181
+ my $date = `date`; chomp($date);
182
+ my $time = `date +%s`; chomp($time);
183
+ # Write XML header
184
+ $writer->startTag("results", "time" => $time, "date" => $date);
185
+
186
+ # Write authors
187
+ $writer->startTag("authors");
188
+
189
+ # Write the author name and his corresponding institution
190
+ foreach my $author (keys %{ $aut_signal })
191
+ {
192
+ $writer->startTag("author");
193
+
194
+ $writer->startTag("fullname", "source" => "parscit");
195
+ $writer->characters($author);
196
+ $writer->endTag("fullname");
197
+
198
+ $writer->startTag("institutions");
199
+ =pod
200
+ foreach my $signal (@{ $aut_signal->{ $author } })
201
+ {
202
+ $signal =~ s/^\s+|\s+$//g;
203
+ # Skip blank
204
+ if ($signal eq "") { next; }
205
+
206
+ $writer->startTag("institution", "symbol" => $signal);
207
+ $writer->characters($aff_signal->{ $signal });
208
+ $writer->endTag("institution");
209
+ }
210
+ =cut
211
+
212
+ foreach my $affiliation (@{ $aa->{ $author } })
213
+ {
214
+ $writer->startTag("institution");
215
+ $writer->characters($affiliation);
216
+ $writer->endTag("institution");
217
+ }
218
+
219
+ $writer->endTag("institutions");
220
+
221
+ $writer->endTag("author");
222
+ }
223
+
224
+ # Finish authors
225
+ $writer->endTag("authors");
226
+
227
+ # Write institutions
228
+ $writer->startTag("institutions");
229
+
230
+ # Write the instituion name
231
+ foreach my $institute (@{ $affs })
232
+ {
233
+ $writer->startTag("institution");
234
+ $writer->characters($institute);
235
+ $writer->endTag("institution");
236
+ }
237
+
238
+ $writer->endTag("institutions");
239
+
240
+ # Done
241
+ $writer->endTag("results");
242
+ # Done
243
+ $writer->endTag("algorithm");
244
+ # Done
245
+ $writer->end();
246
+
247
+ # Return the xml content back to the caller
248
+ return $sxml;
249
+ }
250
+
251
+ # Features of the relational classifier between author and affiliation
252
+ sub AAFeatureExtraction
253
+ {
254
+ my ($aut_rc, $aff_rc) = @_;
255
+
256
+ # Relational features
257
+ my $features = "";
258
+
259
+ # Features between x authors
260
+ foreach my $author (keys %{ $aut_rc })
261
+ {
262
+ my @aut_tokens = split /\s/, $author;
263
+ my $author_nb = join '|||', @aut_tokens;
264
+
265
+ my $min_aff_x = undef;
266
+ my $min_dist_x = LONG_MAX;
267
+ my $min_aff_y = undef;
268
+ my $min_dist_y = LONG_MAX;
269
+ # Find the nearest affiliation
270
+ foreach my $aff (keys %{ $aff_rc })
271
+ {
272
+ my $aut_x = ($aut_rc->{ $author }->left + $aut_rc->{ $author }->right) / 2;
273
+ my $aut_y = ($aut_rc->{ $author }->top + $aut_rc->{ $author }->bottom) / 2;
274
+
275
+ my $aff_x = ($aff_rc->{ $aff }->left + $aff_rc->{ $aff }->right) / 2;
276
+ my $aff_y = ($aff_rc->{ $aff }->top + $aff_rc->{ $aff }->bottom) / 2;
277
+
278
+ my $dis_x = abs( $aut_x - $aff_x );
279
+ my $dis_y = abs( $aut_y - $aff_y );
280
+ # Distance between an author and an affiliation
281
+ # my $distance = sqrt( $dis_x * $dis_x + $dis_y * $dis_y );
282
+
283
+ # Check if it it the minimum distance in x axis
284
+ if ($dis_x < $min_dist_x)
285
+ {
286
+ $min_dist_x = $dis_x;
287
+ $min_aff_x = $aff;
288
+ }
289
+
290
+ # Check if it it the minimum distance in y axis
291
+ if ($dis_y < $min_dist_y)
292
+ {
293
+ $min_dist_y = $dis_y;
294
+ $min_aff_y = $aff;
295
+ }
296
+ }
297
+
298
+ # and y affiliation
299
+ foreach my $aff (keys %{ $aff_rc })
300
+ {
301
+ my @aff_tokens = split /\s/, $aff;
302
+ my $aff_nb = join '|||', @aff_tokens;
303
+
304
+ # Content
305
+ $features .= $author_nb . "#" . $aff_nb . "\t";
306
+
307
+ my $signal = undef;
308
+ # Signal
309
+ if ((scalar(@{ $aut_rc->{ $author }->signals }) == 0) || (scalar(@{ $aff_rc->{ $aff }->signals }) == 0))
310
+ {
311
+ $signal = "diff";
312
+ }
313
+ else
314
+ {
315
+ my $matched = undef;
316
+ # Check each author signal
317
+ foreach my $aut_sig (@{ $aut_rc->{ $author }->signals })
318
+ {
319
+ # if it match with affiliation signal
320
+ if ($aut_sig eq ${ $aff_rc->{ $aff }->signals }[ 0 ]) { $matched = 1; last; }
321
+ }
322
+
323
+ $signal = (! defined $matched) ? "diff" : "same";
324
+ }
325
+
326
+ # Signal
327
+ $features .= $signal . "\t";
328
+
329
+ # Same page
330
+ my $page = ($aut_rc->{ $author }->page == $aff_rc->{ $aff }->page) ? "yes" : "no";
331
+ $features .= $page . "\t";
332
+
333
+ my $section = undef;
334
+ # Same section
335
+ if ($page eq "yes")
336
+ {
337
+ $section = ($aut_rc->{ $author }->section == $aff_rc->{ $aff }->section) ? "yes" : "no";
338
+ $features .= $section . "\t";
339
+ }
340
+ else
341
+ {
342
+ $section = "no";
343
+ $features .= $section . "\t";
344
+ }
345
+
346
+ my $para = undef;
347
+ # Same paragraph
348
+ if (($page eq "yes") && ($section eq "yes"))
349
+ {
350
+ $para = ($aut_rc->{ $author }->para == $aff_rc->{ $aff }->para) ? "yes" : "no";
351
+ $features .= $para . "\t";
352
+ }
353
+ else
354
+ {
355
+ $para = "no";
356
+ $features .= $para . "\t";
357
+ }
358
+
359
+ my $line = undef;
360
+ # Same line
361
+ if (($page eq "yes") && ($section eq "yes") && ($para eq "yes"))
362
+ {
363
+ $line = ($aut_rc->{ $author }->line == $aff_rc->{ $aff }->line) ? "yes" : "no";
364
+ $features .= $line . "\t";
365
+ }
366
+ else
367
+ {
368
+ $line = "no";
369
+ $features .= $line . "\t";
370
+ }
371
+
372
+ # Is neartest affiliation in x axis ?
373
+ my $nearest_x = ($aff eq $min_aff_x) ? "yes" : "no";
374
+ $features .= $nearest_x . "\t";
375
+
376
+ # Is neartest affiliation in y axis ?
377
+ my $nearest_y = ($aff eq $min_aff_y) ? "yes" : "no";
378
+ $features .= $nearest_y . "\n";
379
+ }
380
+ }
381
+
382
+ return $features;
383
+ }
384
+
385
+ # Actually do the matching between author and affiliation
386
+ sub AAMatchingImp
387
+ {
388
+ my ($features) = @_;
389
+
390
+ # Temporary input file for CRF
391
+ my $infile = BuildTmpFile("aa-input");
392
+ # Temporary output file for CRF
393
+ my $outfile = BuildTmpFile("aa-output");
394
+
395
+ my $output_handle = undef;
396
+ # Split and write to temporary input
397
+ open $output_handle, ">:utf8", $infile;
398
+ # Split
399
+ my @lines = split /\n/, $features;
400
+ # and write
401
+ foreach my $line (@lines)
402
+ {
403
+ if ($line eq "")
404
+ {
405
+ print $output_handle "\n";
406
+ }
407
+ else
408
+ {
409
+ print $output_handle $line, "\t", "no", "\n";
410
+ }
411
+ }
412
+ # Done
413
+ close $output_handle;
414
+
415
+ # AA matching model
416
+ my $match_model = $SectLabel::Config::matFile;
417
+
418
+ # Matching
419
+ system("$crft -m $match_model $infile > $outfile");
420
+
421
+ # List of authors and their affiliation (if exists)
422
+ my %aa = ();
423
+
424
+ my $input_handle = undef;
425
+ # Read the CRF output
426
+ open $input_handle, "<:utf8", $outfile;
427
+ # Read each line and get its label
428
+ while (<$input_handle>)
429
+ {
430
+ my $line = $_;
431
+ # Trim
432
+ $line =~ s/^\s+|\s+$//g;
433
+ # Blank linem, what the heck ?
434
+ if ($line eq "") { next; }
435
+
436
+ # Split the line
437
+ my @fields = split /\t/, $line;
438
+ # and extract the class and the content
439
+ my $class = $fields[ -1 ];
440
+ my $content = $fields[ 0 ];
441
+
442
+ # You miss
443
+ if ($class ne "yes") { next; }
444
+
445
+ # Split the content into author name and affiliation name
446
+ my @tmp = split /#/, $content;
447
+ # Author name
448
+ my $author = $tmp[ 0 ];
449
+ $author =~ s/\|\|\|/ /g;
450
+ # Affiliation name
451
+ my $aff = $tmp[ 1 ];
452
+ $aff =~ s/\|\|\|/ /g;
453
+
454
+ # Save
455
+ if (! exists $aa{ $author }) { $aa{ $author } = (); }
456
+ # Save
457
+ push @{ $aa{ $author } }, $aff;
458
+ }
459
+
460
+ # Done
461
+ close $input_handle;
462
+
463
+ # Clean up
464
+ unlink $infile;
465
+ unlink $outfile;
466
+ # Done
467
+ return (\%aa);
468
+ }
469
+
470
+ # Extract affiliation and their signal using crf
471
+ sub AffiliationExtraction
472
+ {
473
+ my ($features, $rc_features) = @_;
474
+
475
+ # Temporary input file for CRF
476
+ my $infile = BuildTmpFile("aff-input");
477
+ # Temporary output file for CRF
478
+ my $outfile = BuildTmpFile("aff-output");
479
+
480
+ my $output_handle = undef;
481
+ # Split and write to temporary input
482
+ open $output_handle, ">:utf8", $infile;
483
+ # Split
484
+ my @lines = split /\n/, $features;
485
+ # and write
486
+ foreach my $line (@lines)
487
+ {
488
+ if ($line eq "")
489
+ {
490
+ print $output_handle "\n";
491
+ }
492
+ else
493
+ {
494
+ print $output_handle $line, "\t", "affiliation", "\n";
495
+ }
496
+ }
497
+ # Done
498
+ close $output_handle;
499
+
500
+ # Author model
501
+ my $aff_model = $SectLabel::Config::affFile;
502
+
503
+ # Split the authors
504
+ system("$crft -m $aff_model $infile > $outfile");
505
+
506
+ # Each affiliation can have only one signal
507
+ my %asg = ();
508
+ # Each affilitiaon can have only one struct
509
+ my %aaf = ();
510
+ # List of all affiliations
511
+ my @aff = ();
512
+
513
+ # Each line in the relational features string
514
+ my @rc_lines = split /\n/, $rc_features;
515
+
516
+ my $input_handle = undef;
517
+ # Read the CRF output
518
+ open $input_handle, "<:utf8", $outfile;
519
+ # Author and signal string
520
+ my $prev_class = "";
521
+ my @aff_str = ();
522
+ my $signal_str = "";
523
+ # Relational classifier
524
+ my @aaf_rc = ();
525
+ # Line counter
526
+ my $counter = 0;
527
+ # Next to last signal
528
+ my $ntl_signal = "";
529
+ # Read each line and get its label
530
+ # TODO: The code assumes that an affiliation will have the following format: 1 foobar institute
531
+ while (<$input_handle>)
532
+ {
533
+ my $line = $_;
534
+ # Trim
535
+ $line =~ s/^\s+|\s+$//g;
536
+ # Blank line mark the end of an affiliation section
537
+ if ($line eq "")
538
+ {
539
+ if ($prev_class eq "affiliation")
540
+ {
541
+ my ($affiliation, $rcs) = NormalizeAffiliationName(\@aff_str, \@aaf_rc);
542
+ # Save the affiliation
543
+ push @aff, $affiliation;
544
+ # and its signal
545
+ if ($ntl_signal ne "") { $asg{ $ntl_signal } = $affiliation; }
546
+
547
+ # Save the signal
548
+ push @{ $rcs->signals }, $ntl_signal;
549
+ # Save the record
550
+ $aaf{ $affiliation } = $rcs;
551
+ }
552
+ elsif ($prev_class eq "signal")
553
+ {
554
+ # Save the next to last signal
555
+ $ntl_signal = NormalizeAffiliationSignal($signal_str);
556
+ }
557
+
558
+ # Cleanup
559
+ $ntl_signal = "";
560
+ # Cleanup
561
+ @aff_str = ();
562
+ $signal_str = "";
563
+ $prev_class = "";
564
+ # Cleanup
565
+ @aaf_rc = ();
566
+
567
+ # Update the counter
568
+ $counter++;
569
+
570
+ next ;
571
+ }
572
+
573
+ # Split the line
574
+ my @fields = split /\t/, $line;
575
+ # and extract the class and the content
576
+ my $class = $fields[ -1 ];
577
+ my $content = $fields[ 0 ];
578
+
579
+ if ($class eq $prev_class)
580
+ {
581
+ # An affiliation
582
+ if ($class eq "affiliation")
583
+ {
584
+ push @aff_str, $content;
585
+ push @aaf_rc, $rc_lines[ $counter ];
586
+ }
587
+ # A signal
588
+ elsif ($class eq "signal")
589
+ {
590
+ $signal_str .= $content . " ";
591
+ }
592
+ }
593
+ else
594
+ {
595
+ if ($prev_class eq "affiliation")
596
+ {
597
+ my ($affiliation, $rcs) = NormalizeAffiliationName(\@aff_str, \@aaf_rc);
598
+ # Save the affiliation
599
+ push @aff, $affiliation;
600
+ # and its signal
601
+ if ($ntl_signal ne "") { $asg{ $ntl_signal } = $affiliation; }
602
+
603
+ # Save the signal
604
+ push @{ $rcs->signals }, $ntl_signal;
605
+ # Save the record
606
+ $aaf{ $affiliation } = $rcs;
607
+
608
+ }
609
+ elsif ($prev_class eq "signal")
610
+ {
611
+ # Save the next to last signal
612
+ $ntl_signal = NormalizeAffiliationSignal($signal_str);
613
+ }
614
+
615
+ # Cleanup
616
+ @aff_str = ();
617
+ $signal_str = "";
618
+ @aaf_rc = ();
619
+ # Switch to the current class
620
+ $prev_class = $class;
621
+
622
+ if ($class eq "affiliation")
623
+ {
624
+ push @aff_str, $content;
625
+ push @aaf_rc, $rc_lines[ $counter ];
626
+ }
627
+ elsif ($class eq "signal")
628
+ {
629
+ $signal_str .= $content . " ";
630
+ }
631
+ }
632
+
633
+ # Update the counter
634
+ $counter++;
635
+ }
636
+
637
+ # Final class
638
+ if ($prev_class eq "affiliation")
639
+ {
640
+ my ($affiliation, $rcs) = NormalizeAffiliationName(\@aff_str, \@aaf_rc);
641
+ # Save the affiliation
642
+ push @aff, $affiliation;
643
+ # and its signal
644
+ if ($ntl_signal ne "") { $asg{ $ntl_signal } = $affiliation; }
645
+
646
+ # Save the signal
647
+ push @{ $rcs->signals }, $ntl_signal;
648
+ # Save the record
649
+ $aaf{ $affiliation } = $rcs;
650
+ }
651
+ elsif ($prev_class eq "signal")
652
+ {
653
+ # Save the next to last signal
654
+ $ntl_signal = NormalizeAffiliationSignal($signal_str);
655
+ }
656
+
657
+ # Done
658
+ close $input_handle;
659
+
660
+ # Clean up
661
+ unlink $infile;
662
+ unlink $outfile;
663
+ # Done
664
+ return (\%asg, \%aaf, \@aff);
665
+ }
666
+
667
+ sub NormalizeAffiliationSignal
668
+ {
669
+ my ($signal_str) = @_;
670
+
671
+ # Trim
672
+ $signal_str =~ s/^\s+|\s+$//g;
673
+ # Remove all space inside the signature
674
+ $signal_str =~ s/\s+//g;
675
+
676
+ # Done
677
+ return $signal_str;
678
+ }
679
+
680
+ sub NormalizeAffiliationName
681
+ {
682
+ my ($aff_str, $aaf_rc) = @_;
683
+
684
+ # Constraint
685
+ if (scalar(@{ $aff_str }) != scalar(@{ $aaf_rc })) { print STDERR "# It cannot happen, if you encounter it, please consider report it as a bug", "\n"; die; }
686
+
687
+ # Affiliation string
688
+ my $affiliation = join ' ', @{ $aff_str };
689
+
690
+ # First word
691
+ my @fields = split /\s/, $aaf_rc->[ 0 ];
692
+ # Save the relational features of an affiliation (its first word)
693
+ my $rcs = aff_rcfeatures->new( signals => [],
694
+ top => $fields[ 1 ], bottom => $fields[ 2 ], left => $fields[ 3 ], right => $fields[ 4 ],
695
+ page => $fields[ 5 ], section => $fields[ 6 ], para => $fields[ 7 ], line => $fields[ 8 ] );
696
+ # Done
697
+ return ($affiliation, $rcs);
698
+ }
699
+
700
+ # Extract author name and their signal using crf
701
+ sub AuthorExtraction
702
+ {
703
+ my ($features, $rc_features) = @_;
704
+
705
+ # Temporary input file for CRF
706
+ my $infile = BuildTmpFile("aut-input");
707
+ # Temporary output file for CRF
708
+ my $outfile = BuildTmpFile("aut-output");
709
+
710
+ my $output_handle = undef;
711
+ # Split and write to temporary input
712
+ open $output_handle, ">:utf8", $infile;
713
+ # Split
714
+ my @lines = split /\n/, $features;
715
+ # and write
716
+ foreach my $line (@lines)
717
+ {
718
+ if ($line eq "")
719
+ {
720
+ print $output_handle "\n";
721
+ }
722
+ else
723
+ {
724
+ print $output_handle $line, "\t", "ns", "\n";
725
+ }
726
+ }
727
+ # Done
728
+ close $output_handle;
729
+
730
+ # Author model
731
+ my $author_model = $SectLabel::Config::autFile;
732
+
733
+ # Split the authors
734
+ system("$crft -m $author_model $infile > $outfile");
735
+
736
+ # Each author can have one or more signals
737
+ my %asg = ();
738
+ # Each author can have only one struct
739
+ my %aas = ();
740
+
741
+ # Each line in the relational features string
742
+ my @rc_lines = split /\n/, $rc_features;
743
+
744
+ my $input_handle = undef;
745
+ # Read the CRF output
746
+ open $input_handle, "<:utf8", $outfile;
747
+ # Author and signal string
748
+ my $prev_class = "";
749
+ my @author_str = ();
750
+ my $signal_str = "";
751
+ # Relational classifier
752
+ my @author_rc = ();
753
+ # Line counter
754
+ my $counter = 0;
755
+ # Next to last authors
756
+ my %ntl_asg = ();
757
+ #
758
+ my $is_authors = 0;
759
+ # Read each line and get its label
760
+ while (<$input_handle>)
761
+ {
762
+ my $line = $_;
763
+ # Trim
764
+ $line =~ s/^\s+|\s+$//g;
765
+ # Blank line mark the end of an author section
766
+ if ($line eq "")
767
+ {
768
+ if ($prev_class eq "author")
769
+ {
770
+ my ($authors, $rcs) = NormalizeAuthorNames(\@author_str, \@author_rc);
771
+ # Save each author
772
+ for (my $i = 0; $i < scalar(@{ $authors }); $i++)
773
+ {
774
+ $asg{ $authors->[ $i ] } = ();
775
+ $aas{ $authors->[ $i ] } = $rcs->[ $i ];
776
+ $ntl_asg{ $authors->[ $i ] } = 0;
777
+ }
778
+ }
779
+ elsif ($prev_class eq "signal")
780
+ {
781
+ my $signals = NormalizeAuthorSignal($signal_str);
782
+ # Save each signal to its corresponding author
783
+ foreach my $author (keys %ntl_asg)
784
+ {
785
+ foreach my $signal (@{ $signals })
786
+ {
787
+ push @{ $asg{ $author } }, $signal;
788
+ push @{ $aas{ $author }->signals }, $signal;
789
+ }
790
+ }
791
+ }
792
+
793
+ # Cleanup
794
+ %ntl_asg = ();
795
+ # Cleanup
796
+ @author_str = ();
797
+ $signal_str = "";
798
+ @author_rc = ();
799
+ # Cleanup
800
+ $prev_class = "";
801
+
802
+ # Update the counter
803
+ $counter++;
804
+
805
+ #
806
+ $is_authors = 0;
807
+
808
+ next;
809
+ }
810
+
811
+ # Split the line
812
+ my @fields = split /\t/, $line;
813
+ # and extract the class and the content
814
+ my $class = $fields[ -1 ];
815
+ my $content = $fields[ 0 ];
816
+
817
+ if ($class eq $prev_class)
818
+ {
819
+ # An author
820
+ if ($class eq "author")
821
+ {
822
+ push @author_str, $content;
823
+ push @author_rc, $rc_lines[ $counter ];
824
+ }
825
+ # A signal
826
+ elsif ($class eq "signal")
827
+ {
828
+ $signal_str .= $content . " ";
829
+ }
830
+ }
831
+ else
832
+ {
833
+ if ($prev_class eq "author")
834
+ {
835
+ my ($authors, $rcs) = NormalizeAuthorNames(\@author_str, \@author_rc);
836
+ # Save each author
837
+ for (my $i = 0; $i < scalar(@{ $authors }); $i++)
838
+ {
839
+ $asg{ $authors->[ $i ] } = ();
840
+ $aas{ $authors->[ $i ] } = $rcs->[ $i ];
841
+ $ntl_asg{ $authors->[ $i ] } = 0;
842
+ }
843
+ }
844
+ elsif ($prev_class eq "signal")
845
+ {
846
+ my $signals = NormalizeAuthorSignal($signal_str);
847
+ # Save each signal to its corresponding author
848
+ foreach my $author (keys %ntl_asg)
849
+ {
850
+ foreach my $signal (@{ $signals })
851
+ {
852
+ push @{ $asg{ $author } }, $signal;
853
+ push @{ $aas{ $author }->signals }, $signal;
854
+ }
855
+ }
856
+ }
857
+
858
+ # Clean the next to last author list if this current class is author
859
+ if (($is_authors == 0) && ($class eq "author")) { %ntl_asg = (); $is_authors = 1; }
860
+ #
861
+ if ($class eq "signal") { $is_authors = 0; }
862
+
863
+ # Cleanup
864
+ @author_str = ();
865
+ $signal_str = "";
866
+ @author_rc = ();
867
+ # Switch to the current class
868
+ $prev_class = $class;
869
+
870
+ if ($class eq "author")
871
+ {
872
+ push @author_str, $content;
873
+ push @author_rc, $rc_lines[ $counter ];
874
+ }
875
+ elsif ($class eq "signal")
876
+ {
877
+ $signal_str .= $content . " ";
878
+ }
879
+ }
880
+
881
+ # Update the counter
882
+ $counter++;
883
+ }
884
+
885
+ # Final class
886
+ if ($prev_class eq "author")
887
+ {
888
+ my ($authors, $rcs) = NormalizeAuthorNames(\@author_str, \@author_rc);
889
+ # Save each author
890
+ for (my $i = 0; $i < scalar(@{ $authors }); $i++)
891
+ {
892
+ $asg{ $authors->[ $i ] } = ();
893
+ $aas{ $authors->[ $i ] } = $rcs->[ $i ];
894
+ $ntl_asg{ $authors->[ $i ] } = 0;
895
+ }
896
+ }
897
+ elsif ($prev_class eq "signal")
898
+ {
899
+ my $signals = NormalizeAuthorSignal($signal_str);
900
+ # Save each signal to its corresponding author
901
+ foreach my $author (keys %ntl_asg)
902
+ {
903
+ foreach my $signal (@{ $signals })
904
+ {
905
+ push @{ $asg{ $author } }, $signal;
906
+ push @{ $aas{ $author }->signals }, $signal;
907
+ }
908
+ }
909
+ }
910
+
911
+ # Done
912
+ close $input_handle;
913
+
914
+ # Clean up
915
+ unlink $infile;
916
+ unlink $outfile;
917
+ # Done
918
+ return (\%asg, \%aas);
919
+ }
920
+
921
+ sub NormalizeAuthorNames
922
+ {
923
+ my ($author_str, $author_rc) = @_;
924
+
925
+ # Constraint
926
+ if (scalar(@{ $author_str }) != scalar(@{ $author_rc })) { print STDERR "# It cannot happen, if you encounter it, please consider report it as a bug", "\n"; die; }
927
+
928
+ # Mark the beginning of an author name
929
+ my $begin = 1;
930
+ # and its corresponding relational features
931
+ my $rcbegin = 0;
932
+
933
+ my @current = ();
934
+ my @authors = ();
935
+ my @rcs = ();
936
+ # Check all tokens in the author string
937
+ for (my $i = 0; $i < scalar(@{ $author_str }); $i++)
938
+ {
939
+ my $token = $author_str->[ $i ];
940
+
941
+ # Mark the end of an author name
942
+ if ($token =~ m/^(&|and|,|;)$/i)
943
+ {
944
+ if (scalar(@current) != 0)
945
+ {
946
+ push @authors, ParsCit::PostProcess::NormalizeAuthorName(@current);
947
+
948
+ # Save the relational features of an author (its first word)
949
+ my @fields = split /\s/, $author_rc->[ $rcbegin ];
950
+ # Create new record
951
+ my $tmp = aut_rcfeatures->new( signals => [],
952
+ top => $fields[ 1 ], bottom => $fields[ 2 ], left => $fields[ 3 ], right => $fields[ 4 ],
953
+ page => $fields[ 5 ], section => $fields[ 6 ], para => $fields[ 7 ], line => $fields[ 8 ] );
954
+ # Save the record
955
+ push @rcs, $tmp;
956
+ }
957
+
958
+ # Cleanup
959
+ @current = ();
960
+ $begin = 1;
961
+
962
+ next;
963
+ }
964
+
965
+ # Mark the begin of an author name
966
+ if ($begin == 1)
967
+ {
968
+ push @current, $token;
969
+
970
+ $begin = 0;
971
+ $rcbegin = $i;
972
+
973
+ next;
974
+ }
975
+
976
+ # Author name ending with a comma
977
+ if ($token =~ m/,$/)
978
+ {
979
+ push @current, $token;
980
+
981
+ if (scalar(@current) != 0)
982
+ {
983
+ push @authors, ParsCit::PostProcess::NormalizeAuthorName(@current);
984
+
985
+ # Save the relational features of an author (its first word)
986
+ my @fields = split /\s/, $author_rc->[ $rcbegin ];
987
+ # Create new record
988
+ my $tmp = aut_rcfeatures->new( signals => [],
989
+ top => $fields[ 1 ], bottom => $fields[ 2 ], left => $fields[ 3 ], right => $fields[ 4 ],
990
+ page => $fields[ 5 ], section => $fields[ 6 ], para => $fields[ 7 ], line => $fields[ 8 ] );
991
+ # Save the record
992
+ push @rcs, $tmp;
993
+ }
994
+
995
+ # Cleanup
996
+ @current = ();
997
+ $begin = 1;
998
+ }
999
+ # or it's just parts of the name
1000
+ else
1001
+ {
1002
+ push @current, $token;
1003
+ }
1004
+ }
1005
+
1006
+ # Last author name
1007
+ if (scalar(@current) != 0)
1008
+ {
1009
+ push @authors, ParsCit::PostProcess::NormalizeAuthorName(@current);
1010
+
1011
+ # Save the relational features of an author (its first word)
1012
+ my @fields = split /\s/, $author_rc->[ $rcbegin ];
1013
+ # Create new record
1014
+ my $tmp = aut_rcfeatures->new( signals => [],
1015
+ top => $fields[ 1 ], bottom => $fields[ 2 ], left => $fields[ 3 ], right => $fields[ 4 ],
1016
+ page => $fields[ 5 ], section => $fields[ 6 ], para => $fields[ 7 ], line => $fields[ 8 ] );
1017
+ # Save the record
1018
+ push @rcs, $tmp;
1019
+ }
1020
+
1021
+ # Done
1022
+ return (\@authors, \@rcs);
1023
+ }
1024
+
1025
+ #
1026
+ sub NormalizeAuthorSignal
1027
+ {
1028
+ my ($signal_str) = @_;
1029
+
1030
+ # Trim
1031
+ $signal_str =~ s/^\s+|\s+$//g;
1032
+ # Split into individual signal
1033
+ my @signals = split / |,|:|;/, $signal_str;
1034
+
1035
+ # Done
1036
+ return \@signals;
1037
+ }
1038
+
1039
+ # Extract features from affiliation lines
1040
+ # The list of features include
1041
+ # Content
1042
+ # Content, lower case, no punctuation
1043
+ # Content length
1044
+ # First word in line
1045
+ #
1046
+ # XML features
1047
+ # Subscript, superscript
1048
+ # Bold
1049
+ # Italic
1050
+ # Underline
1051
+ # Relative font size
1052
+ # Differentiate features
1053
+ sub AffiliationFeatureExtraction
1054
+ {
1055
+ my ($aff_lines, $aff_addrs) = @_;
1056
+
1057
+ # NOTE: Relational classifier features
1058
+ my $rc_features = "";
1059
+
1060
+ # Features will be stored here
1061
+ my $features = "";
1062
+ # First word in line
1063
+ my $is_first_line = undef;
1064
+
1065
+ # Font size
1066
+ my %fonts = ();
1067
+ # Each line contains many runs
1068
+ foreach my $line (@{ $aff_lines })
1069
+ {
1070
+ my $runs = $line->get_objs_ref();
1071
+ # Iterator though all work in all lines
1072
+ foreach my $run (@{ $runs })
1073
+ {
1074
+ my $fsize = $run->get_font_size();
1075
+ my $words = $run->get_objs_ref();
1076
+
1077
+ # Statistic
1078
+ if (! exists $fonts{ $fsize })
1079
+ {
1080
+ $fonts{ $fsize } = scalar(@{ $words });
1081
+ }
1082
+ else
1083
+ {
1084
+ $fonts{ $fsize } += scalar(@{ $words });
1085
+ }
1086
+ }
1087
+ }
1088
+
1089
+ my $dominate_font = undef;
1090
+ # Sort all the font descend with the number of their appearance
1091
+ my @sorted = sort { $fonts{ $b } <=> $fonts{ $a } } keys %fonts;
1092
+ # Select the dominated font
1093
+ $dominate_font = $sorted[ 0 ];
1094
+
1095
+ my $size_mismatch = undef;
1096
+ # TODO: serious error if the size of aff_lines and the size of aff_addrs mismatch
1097
+ if (scalar(@{ $aff_lines }) != scalar(@{ $aff_addrs }))
1098
+ {
1099
+ $size_mismatch = 1;
1100
+ # Print the error but still try to continue
1101
+ print STDERR "# Total number of affiliation lines (" . scalar(@{ $aff_lines }) . ") != Total number of affiliation addresses (" . scalar(@{ $aff_addrs }) . ")." . "\n";
1102
+ }
1103
+
1104
+ my $prev_page = undef;
1105
+ my $prev_sect = undef;
1106
+ my $prev_para = undef;
1107
+ # Each line contains many runs
1108
+ for (my $counter = 0; $counter < scalar(@{ $aff_lines }); $counter++)
1109
+ {
1110
+ # Get the line object
1111
+ my $line = $aff_lines->[ $counter ];
1112
+
1113
+ # Check the size of aff_lines and aff_addrs
1114
+ if (! defined $size_mismatch)
1115
+ {
1116
+ # Check if two consecutive lines are from two different sections
1117
+ if (! defined $prev_page)
1118
+ {
1119
+ # Init
1120
+ $prev_page = $aff_addrs->[ $counter ]->{ 'L1' };
1121
+ $prev_sect = $aff_addrs->[ $counter ]->{ 'L2' };
1122
+ $prev_para = $aff_addrs->[ $counter ]->{ 'L3' };
1123
+ }
1124
+ else
1125
+ {
1126
+ # Affiliations from different sections will be separated immediately
1127
+ if (($prev_page != $aff_addrs->[ $counter ]->{ 'L1' }) ||
1128
+ ($prev_sect != $aff_addrs->[ $counter ]->{ 'L2' }) ||
1129
+ ($prev_para != $aff_addrs->[ $counter ]->{ 'L3' }))
1130
+ {
1131
+ $features .= "\n";
1132
+
1133
+ # NOTE: Relational classifier features
1134
+ $rc_features .= "\n";
1135
+ }
1136
+
1137
+ # Save the paragraph index
1138
+ $prev_page = $aff_addrs->[ $counter ]->{ 'L1' };
1139
+ $prev_sect = $aff_addrs->[ $counter ]->{ 'L2' };
1140
+ $prev_para = $aff_addrs->[ $counter ]->{ 'L3' };
1141
+ }
1142
+ }
1143
+
1144
+ # Set first word in line
1145
+ $is_first_line = 1;
1146
+
1147
+ # Two previous words
1148
+ my $prev_word = undef;
1149
+ my $prev_prev_word = undef;
1150
+
1151
+ # Format of the previous word
1152
+ my ($prev_bold, $prev_italic, $prev_underline, $prev_suscript, $prev_fontsize) = "unknown";
1153
+
1154
+ my $runs = $line->get_objs_ref();
1155
+ # Iterator though all work in all lines
1156
+ foreach my $run (@{ $runs })
1157
+ {
1158
+ # The run must be non-empty
1159
+ my $tmp = $run->get_content();
1160
+ # Trim
1161
+ $tmp =~ s/^\s+|\s+$//g;
1162
+ # Skip blank run
1163
+ if ($tmp eq "") { next; }
1164
+
1165
+ ###
1166
+ # The following features are XML features
1167
+ ###
1168
+
1169
+ # Bold format
1170
+ my $bold = ($run->get_bold() eq "true") ? "bold" : "none";
1171
+
1172
+ # Italic format
1173
+ my $italic = ($run->get_italic() eq "true") ? "italic" : "none";
1174
+
1175
+ # Underline
1176
+ my $underline = ($run->get_underline() eq "true") ? "underline" : "none";
1177
+
1178
+ # Sub-Sup-script
1179
+ my $suscript = ($run->get_suscript() eq "superscript") ? "super" :
1180
+ ($run->get_suscript() eq "subscript") ? "sub" : "none";
1181
+
1182
+ # Relative font size
1183
+ my $fontsize = ($run->get_font_size() > $dominate_font) ? "large" :
1184
+ ($run->get_font_size() < $dominate_font) ? "small" : "normal";
1185
+
1186
+ ###
1187
+ # End of XML features
1188
+ ###
1189
+
1190
+ # All words in the run
1191
+ my $words = $run->get_objs_ref();
1192
+
1193
+ # For each word
1194
+ foreach my $word (@{ $words })
1195
+ {
1196
+ # Get word location
1197
+ my $top = $word->get_top_pos();
1198
+ my $bottom = $word->get_bottom_pos();
1199
+ my $left = $word->get_left_pos();
1200
+ my $right = $word->get_right_pos();
1201
+
1202
+ # NOTE: heuristic rule, for words in the same line
1203
+ # If the x-axis distance between this word and the previous word is
1204
+ # three times larger than the distance between the previous word and
1205
+ # the word before it, then it marks the separator.
1206
+ # The better way to do this is to introduce it as a new feature in the
1207
+ # author and affiliation model but this step requires re-training these
1208
+ # two models, so ...
1209
+ #
1210
+ # NOTE: Assuming left to right writing
1211
+ if (! defined $prev_word)
1212
+ {
1213
+ $prev_word = $word;
1214
+ }
1215
+ elsif (! defined $prev_prev_word)
1216
+ {
1217
+ # NOTE: Words have the power to both destroy and heal, when words are both
1218
+ # true and kind, they can change our world
1219
+ if (($prev_word->get_left_pos() != $word->get_left_pos()) && ($prev_word->get_right_pos() != $word->get_right_pos()))
1220
+ {
1221
+ $prev_prev_word = $prev_word;
1222
+ $prev_word = $word;
1223
+ }
1224
+ }
1225
+ else
1226
+ {
1227
+ # NOTE: Words have the power to both destroy and heal, when words are both
1228
+ # true and kind, they can change our world
1229
+ if (($prev_word->get_left_pos() != $word->get_left_pos()) && ($prev_word->get_right_pos() != $word->get_right_pos()))
1230
+ {
1231
+ my $prev_dist = abs ($prev_word->get_left_pos() - $prev_prev_word->get_right_pos());
1232
+ my $curr_dist = abs ($word->get_left_pos() - $prev_word->get_right_pos());
1233
+
1234
+ if ($prev_dist * 5 < $curr_dist)
1235
+ {
1236
+ $features .= "\n";
1237
+
1238
+ # NOTE: Relational classifier features
1239
+ $rc_features .= "\n";
1240
+ }
1241
+
1242
+ $prev_prev_word = $prev_word;
1243
+ $prev_word = $word;
1244
+ }
1245
+ }
1246
+
1247
+ # Extract features
1248
+ my $full_content = $word->get_content();
1249
+ # Trim
1250
+ $full_content =~ s/^\s+|\s+$//g;
1251
+
1252
+ # Skip blank run
1253
+ if ($full_content eq "") { next; }
1254
+
1255
+ my @sub_content = ();
1256
+ # This is the tricky part, one word e.g. **affiliation will be
1257
+ # splitted into two parts: the signal, and the affiliation if
1258
+ # possible using regular expression
1259
+ while ($full_content =~ m/([\w|-]*)(\W*)/g)
1260
+ {
1261
+ my $first = $1;
1262
+ my $second = $2;
1263
+
1264
+ # Trim
1265
+ $first =~ s/^\s+|\s+$//g;
1266
+ $second =~ s/^\s+|\s+$//g;
1267
+
1268
+ # Only keep non-blank content
1269
+ if ($first ne "") { push @sub_content, $first; }
1270
+
1271
+ # Check the signal and separator
1272
+ while ($second =~ m/([,|\.|:|;]*)([^,\.:;]*)/g)
1273
+ {
1274
+ my $sub_first = $1;
1275
+ my $sub_second = $2;
1276
+
1277
+ # Trim
1278
+ $sub_first =~ s/^\s+|\s+$//g;
1279
+ $sub_second =~ s/^\s+|\s+$//g;
1280
+
1281
+ # Only keep non-blank separator
1282
+ if ($sub_first ne "") { push @sub_content, $sub_first; }
1283
+ # Only keep non-blank signal
1284
+ if ($sub_second ne "") { push @sub_content, $sub_second; }
1285
+ }
1286
+ }
1287
+
1288
+ foreach my $content (@sub_content)
1289
+ {
1290
+ # Content
1291
+ $features .= $content . "\t";
1292
+
1293
+ my $content_n = $content;
1294
+ # Remove punctuation
1295
+ $content_n =~ s/[^\w]//g;
1296
+ # Lower case
1297
+ my $content_l = lc($content);
1298
+ # Lower case, no punctuation
1299
+ my $content_nl = lc($content_n);
1300
+ # Lower case
1301
+ $features .= $content_l . "\t";
1302
+ # Lower case, no punctuation
1303
+ if ($content_nl ne "")
1304
+ {
1305
+ $features .= $content_nl . "\t";
1306
+ }
1307
+ else
1308
+ {
1309
+ $features .= $content_l . "\t";
1310
+ }
1311
+
1312
+ # Split into character
1313
+ my @chars = split(//, $content);
1314
+ # Content length
1315
+ my $length = (scalar(@chars) == 1) ? "1-char" :
1316
+ (scalar(@chars) == 2) ? "2-char" :
1317
+ (scalar(@chars) == 3) ? "3-char" : "4+char";
1318
+ $features .= $length . "\t";
1319
+
1320
+ # First word in line
1321
+ if ($is_first_line == 1)
1322
+ {
1323
+ $features .= "begin" . "\t";
1324
+
1325
+ # Next words are not the first in line anymore
1326
+ $is_first_line = 0;
1327
+ }
1328
+ else
1329
+ {
1330
+ $features .= "continue" . "\t";
1331
+ }
1332
+
1333
+ ###
1334
+ # The following features are XML features
1335
+ ###
1336
+
1337
+ # Bold format
1338
+ $features .= $bold . "\t";
1339
+
1340
+ # Italic format
1341
+ $features .= $italic . "\t";
1342
+
1343
+ # Underline
1344
+ $features .= $underline . "\t";
1345
+
1346
+ # Sub-Sup-script
1347
+ $features .= $suscript . "\t";
1348
+
1349
+ # Relative font size
1350
+ $features .= $fontsize . "\t";
1351
+
1352
+ # First word in run
1353
+ if (($prev_bold ne $bold) || ($prev_italic ne $italic) || ($prev_underline ne $underline) || ($prev_suscript ne $suscript) || ($prev_fontsize ne $fontsize))
1354
+ {
1355
+ $features .= "fbegin" . "\t";
1356
+ }
1357
+ else
1358
+ {
1359
+ $features .= "fcontinue" . "\t";
1360
+ }
1361
+
1362
+ # New token
1363
+ $features .= "\n";
1364
+
1365
+ # Save the XML format
1366
+ $prev_bold = $bold;
1367
+ $prev_italic = $italic;
1368
+ $prev_underline = $underline;
1369
+ $prev_suscript = $suscript;
1370
+ $prev_fontsize = $fontsize;
1371
+
1372
+ # NOTE: Relational classifier features
1373
+ # Content
1374
+ $rc_features .= $content . "\t";
1375
+ # Location
1376
+ $rc_features .= $top . "\t";
1377
+ $rc_features .= $bottom . "\t";
1378
+ $rc_features .= $left . "\t";
1379
+ $rc_features .= $right . "\t";
1380
+ # Index
1381
+ if (! defined $size_mismatch)
1382
+ {
1383
+ $rc_features .= $aff_addrs->[ $counter ]->{ 'L1' } . "\t";
1384
+ $rc_features .= $aff_addrs->[ $counter ]->{ 'L2' } . "\t";
1385
+ $rc_features .= $aff_addrs->[ $counter ]->{ 'L3' } . "\t";
1386
+ $rc_features .= $aff_addrs->[ $counter ]->{ 'L4' } . "\t";
1387
+ }
1388
+ # Done
1389
+ $rc_features .= "\n";
1390
+ }
1391
+ }
1392
+ }
1393
+ }
1394
+
1395
+ return ($features, $rc_features);
1396
+
1397
+ }
1398
+
1399
+ # Extract features from author lines
1400
+ # The list of features include
1401
+ # Content
1402
+ # Content, lower case, no punctuation
1403
+ # Content length
1404
+ # Capitalization
1405
+ # Numeric property
1406
+ # Last punctuation
1407
+ # First 4-gram
1408
+ # Last 4-gram
1409
+ # Dictionary
1410
+ # First word in line
1411
+ #
1412
+ # XML features
1413
+ # Subscript, superscript
1414
+ # Bold
1415
+ # Italic
1416
+ # Underline
1417
+ # Relative font size
1418
+ # Differentiate features
1419
+ sub AuthorFeatureExtraction
1420
+ {
1421
+ my ($aut_lines, $aut_addrs) = @_;
1422
+
1423
+ # NOTE: Relational classifier features
1424
+ my $rc_features = "";
1425
+
1426
+ # Features will be stored here
1427
+ my $features = "";
1428
+ # First word in line
1429
+ my $is_first_line = undef;
1430
+ # First word in run
1431
+ # my $is_first_run = undef;
1432
+
1433
+ # Font size
1434
+ my %fonts = ();
1435
+ # Each line contains many runs
1436
+ foreach my $line (@{ $aut_lines })
1437
+ {
1438
+ my $runs = $line->get_objs_ref();
1439
+ # Iterator though all work in all lines
1440
+ foreach my $run (@{ $runs })
1441
+ {
1442
+ my $fsize = $run->get_font_size();
1443
+ my $words = $run->get_objs_ref();
1444
+
1445
+ # Statistic
1446
+ if (! exists $fonts{ $fsize })
1447
+ {
1448
+ $fonts{ $fsize } = scalar(@{ $words });
1449
+ }
1450
+ else
1451
+ {
1452
+ $fonts{ $fsize } += scalar(@{ $words });
1453
+ }
1454
+ }
1455
+ }
1456
+
1457
+ my $dominate_font = undef;
1458
+ # Sort all the font descend with the number of their appearance
1459
+ my @sorted = sort { $fonts{ $b } <=> $fonts{ $a } } keys %fonts;
1460
+ # Select the dominated font
1461
+ $dominate_font = $sorted[ 0 ];
1462
+
1463
+ my $size_mismatch = undef;
1464
+ # TODO: serious error if the size of aut_lines and the size of aut_addrs mismatch
1465
+ if (scalar(@{ $aut_lines }) != scalar(@{ $aut_addrs }))
1466
+ {
1467
+ $size_mismatch = 1;
1468
+ # Print the error but still try to continue
1469
+ print STDERR "# Total number of author lines (" . scalar(@{ $aut_lines }) . ") != Total number of author addresses (" . scalar(@{ $aut_addrs }) . ")." . "\n";
1470
+ }
1471
+
1472
+ my $prev_page = undef;
1473
+ my $prev_sect = undef;
1474
+ my $prev_para = undef;
1475
+ # Each line contains many runs
1476
+ for (my $counter = 0; $counter < scalar(@{ $aut_lines }); $counter++)
1477
+ {
1478
+ # Get the line object
1479
+ my $line = $aut_lines->[ $counter ];
1480
+
1481
+ # Check the size of aut_line and aut_addrs
1482
+ if (! defined $size_mismatch)
1483
+ {
1484
+ # Check if two consecutive lines are from two different sections
1485
+ if (! defined $prev_page)
1486
+ {
1487
+ # Init
1488
+ $prev_page = $aut_addrs->[ $counter ]->{ 'L1' };
1489
+ $prev_sect = $aut_addrs->[ $counter ]->{ 'L2' };
1490
+ $prev_para = $aut_addrs->[ $counter ]->{ 'L3' };
1491
+ }
1492
+ else
1493
+ {
1494
+ # Authors from different sections will be separated immediately
1495
+ if (($prev_page != $aut_addrs->[ $counter ]->{ 'L1' }) ||
1496
+ ($prev_sect != $aut_addrs->[ $counter ]->{ 'L2' }) ||
1497
+ ($prev_para != $aut_addrs->[ $counter ]->{ 'L3' }))
1498
+ {
1499
+ $features .= "\n";
1500
+
1501
+ # NOTE: Relational classifier features
1502
+ $rc_features .= "\n";
1503
+ }
1504
+
1505
+ # Save the paragraph index
1506
+ $prev_page = $aut_addrs->[ $counter ]->{ 'L1' };
1507
+ $prev_sect = $aut_addrs->[ $counter ]->{ 'L2' };
1508
+ $prev_para = $aut_addrs->[ $counter ]->{ 'L3' };
1509
+ }
1510
+ }
1511
+
1512
+ # Set first word in line
1513
+ $is_first_line = 1;
1514
+
1515
+ # Previous word and the word before this
1516
+ my $prev_prev_word = undef;
1517
+ my $prev_word = undef;
1518
+
1519
+ # Format of the previous word
1520
+ my ($prev_bold, $prev_italic, $prev_underline, $prev_suscript, $prev_fontsize) = "unknown";
1521
+
1522
+ my $runs = $line->get_objs_ref();
1523
+ # Iterator though all work in all lines
1524
+ foreach my $run (@{ $runs })
1525
+ {
1526
+ # The run must be non-empty
1527
+ my $tmp = $run->get_content();
1528
+ # Trim
1529
+ $tmp =~ s/^\s+|\s+$//g;
1530
+ # Skip blank run
1531
+ if ($tmp eq "") { next; }
1532
+
1533
+ # Set first word in run
1534
+ # $is_first_run = 1;
1535
+
1536
+ ###
1537
+ # The following features are XML features
1538
+ ###
1539
+
1540
+ # Bold format
1541
+ my $bold = ($run->get_bold() eq "true") ? "bold" : "none";
1542
+
1543
+ # Italic format
1544
+ my $italic = ($run->get_italic() eq "true") ? "italic" : "none";
1545
+
1546
+ # Underline
1547
+ my $underline = ($run->get_underline() eq "true") ? "underline" : "none";
1548
+
1549
+ # Sub-Sup-script
1550
+ my $suscript = ($run->get_suscript() eq "superscript") ? "super" :
1551
+ ($run->get_suscript() eq "subscript") ? "sub" : "none";
1552
+
1553
+ # Relative font size
1554
+ my $fontsize = ($run->get_font_size() > $dominate_font) ? "large" :
1555
+ ($run->get_font_size() < $dominate_font) ? "small" : "normal";
1556
+
1557
+ ###
1558
+ # End of XML features
1559
+ ###
1560
+
1561
+ # All words in the run
1562
+ my $words = $run->get_objs_ref();
1563
+
1564
+ # For each word
1565
+ foreach my $word (@{ $words })
1566
+ {
1567
+ # Get word location
1568
+ my $top = $word->get_top_pos();
1569
+ my $bottom = $word->get_bottom_pos();
1570
+ my $left = $word->get_left_pos();
1571
+ my $right = $word->get_right_pos();
1572
+
1573
+ # NOTE: heuristic rule, for words in the same line
1574
+ # If the x-axis distance between this word and the previous word is
1575
+ # three times larger than the distance between the previous word and
1576
+ # the word before it, then it marks the separator.
1577
+ # The better way to do this is to introduce it as a new feature in the
1578
+ # author and affiliation model but this step requires re-training these
1579
+ # two models, so ...
1580
+ #
1581
+ # NOTE: Assuming left to right writing
1582
+ if (! defined $prev_word)
1583
+ {
1584
+ $prev_word = $word;
1585
+ }
1586
+ elsif (! defined $prev_prev_word)
1587
+ {
1588
+ # NOTE: Words have the power to both destroy and heal, when words are both
1589
+ # true and kind, they can change our world
1590
+ if (($prev_word->get_left_pos() != $word->get_left_pos()) && ($prev_word->get_right_pos() != $word->get_right_pos()))
1591
+ {
1592
+ $prev_prev_word = $prev_word;
1593
+ $prev_word = $word;
1594
+ }
1595
+ }
1596
+ else
1597
+ {
1598
+ # NOTE: Words have the power to both destroy and heal, when words are both
1599
+ # true and kind, they can change our world
1600
+ if (($prev_word->get_left_pos() != $word->get_left_pos()) && ($prev_word->get_right_pos() != $word->get_right_pos()))
1601
+ {
1602
+
1603
+ my $prev_dist = abs ($prev_word->get_left_pos() - $prev_prev_word->get_right_pos());
1604
+ my $curr_dist = abs ($word->get_left_pos() - $prev_word->get_right_pos());
1605
+
1606
+ if ($prev_dist * 5 < $curr_dist)
1607
+ {
1608
+ $features .= "\n";
1609
+
1610
+ # NOTE: Relational classifier features
1611
+ $rc_features .= "\n";
1612
+ }
1613
+
1614
+ $prev_prev_word = $prev_word;
1615
+ $prev_word = $word;
1616
+ }
1617
+ }
1618
+
1619
+ # Extract features
1620
+ my $full_content = $word->get_content();
1621
+ # Trim
1622
+ $full_content =~ s/^\s+|\s+$//g;
1623
+
1624
+ # Skip blank run
1625
+ if ($full_content eq "") { next; }
1626
+
1627
+ my @sub_content = ();
1628
+ # This is the tricky part, one word e.g. name** will be splitted
1629
+ # into several parts: the name, the signal, and the separator if
1630
+ # possible using regular expression
1631
+ while ($full_content =~ m/([\w|-]*)(\W*)/g)
1632
+ {
1633
+ my $first = $1;
1634
+ my $second = $2;
1635
+
1636
+ # Trim
1637
+ $first =~ s/^\s+|\s+$//g;
1638
+ $second =~ s/^\s+|\s+$//g;
1639
+
1640
+ # Only keep non-blank content
1641
+ if ($first ne "") { push @sub_content, $first; }
1642
+
1643
+ # Check the signal and separator
1644
+ while ($second =~ m/([,|\.|:|;]*)([^,\.:;]*)/g)
1645
+ {
1646
+ my $sub_first = $1;
1647
+ my $sub_second = $2;
1648
+
1649
+ # Trim
1650
+ $sub_first =~ s/^\s+|\s+$//g;
1651
+ $sub_second =~ s/^\s+|\s+$//g;
1652
+
1653
+ # Only keep non-blank separator
1654
+ if ($sub_first ne "") { push @sub_content, $sub_first; }
1655
+ # Only keep non-blank signal
1656
+ if ($sub_second ne "") { push @sub_content, $sub_second; }
1657
+ }
1658
+ }
1659
+
1660
+ foreach my $content (@sub_content)
1661
+ {
1662
+ # Content
1663
+ $features .= $content . "\t";
1664
+
1665
+ my $content_n = $content;
1666
+ # Remove punctuation
1667
+ $content_n =~ s/[^\w]//g;
1668
+ # Lower case
1669
+ my $content_l = lc($content);
1670
+ # Lower case, no punctuation
1671
+ my $content_nl = lc($content_n);
1672
+ # Lower case
1673
+ $features .= $content_l . "\t";
1674
+ # Lower case, no punctuation
1675
+ if ($content_nl ne "")
1676
+ {
1677
+ $features .= $content_nl . "\t";
1678
+ }
1679
+ else
1680
+ {
1681
+ $features .= $content_l . "\t";
1682
+ }
1683
+
1684
+ # Capitalization
1685
+ my $ortho = ($content =~ /^[\p{IsUpper}]$/) ? "single" :
1686
+ ($content =~ /^[\p{IsUpper}][\p{IsLower}]+/) ? "init" :
1687
+ ($content =~ /^[\p{IsUpper}]+$/) ? "all" : "others";
1688
+ $features .= $ortho . "\t";
1689
+
1690
+ # Numeric property
1691
+ my $num = ($content =~ /^[0-9]$/) ? "1dig" :
1692
+ ($content =~ /^[0-9][0-9]$/) ? "2dig" :
1693
+ ($content =~ /^[0-9][0-9][0-9]$/) ? "3dig" :
1694
+ ($content =~ /^[0-9]+$/) ? "4+dig" :
1695
+ ($content =~ /^[0-9]+(th|st|nd|rd)$/) ? "ordinal" :
1696
+ ($content =~ /[0-9]/) ? "hasdig" : "nonnum";
1697
+ $features .= $num . "\t";
1698
+
1699
+ # Last punctuation
1700
+ my $punct = ($content =~ /^[\"\'\`]/) ? "leadq" :
1701
+ ($content =~ /[\"\'\`][^s]?$/) ? "endq" :
1702
+ ($content =~ /\-.*\-/) ? "multi" :
1703
+ ($content =~ /[\-\,\:\;]$/) ? "cont" :
1704
+ ($content =~ /[\!\?\.\"\']$/) ? "stop" :
1705
+ ($content =~ /^[\(\[\{\<].+[\)\]\}\>].?$/) ? "braces" : "others";
1706
+ $features .= $punct . "\t";
1707
+
1708
+ # Split into character
1709
+ my @chars = split(//, $content);
1710
+ my $clen = scalar @chars;
1711
+ # Content length
1712
+ my $length = (scalar(@chars) == 1) ? "1-char" :
1713
+ (scalar(@chars) == 2) ? "2-char" :
1714
+ (scalar(@chars) == 3) ? "3-char" : "4+char";
1715
+ $features .= $length . "\t";
1716
+ # First n-gram
1717
+ $features .= $chars[ 0 ] . "\t";
1718
+ if ($clen >= 2) {
1719
+ $features .= join("", @chars[ 0..1 ]) . "\t";
1720
+ } else {
1721
+ $features .= $length . "\t";
1722
+ }
1723
+ if ($clen >= 3) {
1724
+ $features .= join("", @chars[ 0..2 ]) . "\t";
1725
+ } elsif ($clen >= 2) {
1726
+ $features .= join("", @chars[ 0..1 ]) . "\t";
1727
+ } else {
1728
+ $features .= $length . "\t";
1729
+ }
1730
+ if ($clen >= 4) {
1731
+ $features .= join("", @chars[ 0..3 ]) . "\t";
1732
+ } elsif ($clen >= 3) {
1733
+ $features .= join("", @chars[ 0..2 ]) . "\t";
1734
+ } elsif ($clen >= 2) {
1735
+ $features .= join("", @chars[ 0..1 ]) . "\t";
1736
+ } else {
1737
+ $features .= $length . "\t";
1738
+ }
1739
+ # Last n-gram
1740
+ $features .= $chars[ -1 ] . "\t";
1741
+ if ($clen >= 2) {
1742
+ $features .= join("", @chars[ -2..-1 ]) . "\t";
1743
+ } else {
1744
+ $features .= $chars[ -1 ] . "\t";
1745
+ }
1746
+ if ($clen >= 3) {
1747
+ $features .= join("", @chars[ -3..-1 ]) . "\t";
1748
+ } elsif ($clen >= 2) {
1749
+ $features .= join("", @chars[ -2..-1 ]) . "\t";
1750
+ } else {
1751
+ $features .= $chars[ -1 ] . "\t";
1752
+ }
1753
+ if ($clen >= 4) {
1754
+ $features .= join("", @chars[ -4..-1 ]) . "\t";
1755
+ } elsif ($clen >= 3) {
1756
+ $features .= join("", @chars[ -3..-1 ]) . "\t";
1757
+ } elsif ($clen >= 2) {
1758
+ $features .= join("", @chars[ -2..-1 ]) . "\t";
1759
+ } else {
1760
+ $features .= $chars[ -1 ] . "\t";
1761
+ }
1762
+
1763
+ # Dictionary
1764
+ my $dict_status = (defined $dict{ $content_nl }) ? $dict{ $content_nl } : 0;
1765
+ # Possible names
1766
+ my ($publisher_name, $place_name, $month_name, $last_name, $female_name, $male_name) = undef;
1767
+ # Check all case
1768
+ if ($dict_status >= 32) { $dict_status -= 32; $publisher_name = "publisher" } else { $publisher_name = "no"; }
1769
+ if ($dict_status >= 16) { $dict_status -= 16; $place_name = "place" } else { $place_name = "no"; }
1770
+ if ($dict_status >= 8) { $dict_status -= 8; $month_name = "month" } else { $month_name = "no"; }
1771
+ if ($dict_status >= 4) { $dict_status -= 4; $last_name = "last" } else { $last_name = "no"; }
1772
+ if ($dict_status >= 2) { $dict_status -= 2; $female_name = "female" } else { $female_name = "no"; }
1773
+ if ($dict_status >= 1) { $dict_status -= 1; $male_name = "male" } else { $male_name = "no"; }
1774
+ # Save the feature
1775
+ $features .= $male_name . "\t";
1776
+ $features .= $female_name . "\t";
1777
+ $features .= $last_name . "\t";
1778
+ $features .= $month_name . "\t";
1779
+ $features .= $place_name . "\t";
1780
+ $features .= $publisher_name . "\t";
1781
+
1782
+ # First word in line
1783
+ if ($is_first_line == 1)
1784
+ {
1785
+ $features .= "begin" . "\t";
1786
+
1787
+ # Next words are not the first in line anymore
1788
+ $is_first_line = 0;
1789
+ }
1790
+ else
1791
+ {
1792
+ $features .= "continue" . "\t";
1793
+ }
1794
+
1795
+ ###
1796
+ # The following features are XML features
1797
+ ###
1798
+
1799
+ # Bold format
1800
+ $features .= $bold . "\t";
1801
+
1802
+ # Italic format
1803
+ $features .= $italic . "\t";
1804
+
1805
+ # Underline
1806
+ $features .= $underline . "\t";
1807
+
1808
+ # Sub-Sup-script
1809
+ $features .= $suscript . "\t";
1810
+
1811
+ # Relative font size
1812
+ $features .= $fontsize . "\t";
1813
+
1814
+ # First word in run
1815
+ if (($prev_bold ne $bold) || ($prev_italic ne $italic) || ($prev_underline ne $underline) || ($prev_suscript ne $suscript) || ($prev_fontsize ne $fontsize))
1816
+ {
1817
+ $features .= "fbegin" . "\t";
1818
+
1819
+ # Next words are not the first in line anymore
1820
+ # $is_first_run = 0;
1821
+ }
1822
+ else
1823
+ {
1824
+ $features .= "fcontinue" . "\t";
1825
+ }
1826
+
1827
+ # New token
1828
+ $features .= "\n";
1829
+
1830
+ # Save the XML format
1831
+ $prev_bold = $bold;
1832
+ $prev_italic = $italic;
1833
+ $prev_underline = $underline;
1834
+ $prev_suscript = $suscript;
1835
+ $prev_fontsize = $fontsize;
1836
+
1837
+ # NOTE: Relational classifier features
1838
+ # Content
1839
+ $rc_features .= $content . "\t";
1840
+ # Location
1841
+ $rc_features .= $top . "\t";
1842
+ $rc_features .= $bottom . "\t";
1843
+ $rc_features .= $left . "\t";
1844
+ $rc_features .= $right . "\t";
1845
+ # Index
1846
+ if (! defined $size_mismatch)
1847
+ {
1848
+ $rc_features .= $aut_addrs->[ $counter ]->{ 'L1' } . "\t";
1849
+ $rc_features .= $aut_addrs->[ $counter ]->{ 'L2' } . "\t";
1850
+ $rc_features .= $aut_addrs->[ $counter ]->{ 'L3' } . "\t";
1851
+ $rc_features .= $aut_addrs->[ $counter ]->{ 'L4' } . "\t";
1852
+ }
1853
+ # Done
1854
+ $rc_features .= "\n";
1855
+ }
1856
+ }
1857
+ }
1858
+ }
1859
+
1860
+ return ($features, $rc_features);
1861
+ }
1862
+
1863
+ sub ReadDict
1864
+ {
1865
+ my ($dictfile) = @_;
1866
+
1867
+ # Absolute path
1868
+ my $dictfile_abs = File::Spec->rel2abs($dictfile);
1869
+ # Dictionary handle
1870
+ my $dict_handle = undef;
1871
+ open ($dict_handle, "<:utf8", $dictfile_abs) || die "Could not open dict file $dictfile_abs: $!";
1872
+
1873
+ my $mode = 0;
1874
+ while (<$dict_handle>)
1875
+ {
1876
+ if (/^\#\# Male/) { $mode = 1; } # male names
1877
+ elsif (/^\#\# Female/) { $mode = 2; } # female names
1878
+ elsif (/^\#\# Last/) { $mode = 4; } # last names
1879
+ elsif (/^\#\# Chinese/) { $mode = 4; } # last names
1880
+ elsif (/^\#\# Months/) { $mode = 8; } # month names
1881
+ elsif (/^\#\# Place/) { $mode = 16; } # place names
1882
+ elsif (/^\#\# Publisher/) { $mode = 32; } # publisher names
1883
+ elsif (/^\#/) { next; }
1884
+ else
1885
+ {
1886
+ chop;
1887
+ my $key = $_;
1888
+ my $val = 0;
1889
+ # Has probability
1890
+ if (/\t/) { ($key,$val) = split (/\t/,$_); }
1891
+
1892
+ # Already tagged (some entries may appear in same part of lexicon more than once
1893
+ if (! exists $dict{ $key })
1894
+ {
1895
+ $dict{ $key } = $mode;
1896
+ }
1897
+ else
1898
+ {
1899
+ if ($dict{ $key } >= $mode)
1900
+ {
1901
+ next;
1902
+ }
1903
+ # Not yet tagged
1904
+ else
1905
+ {
1906
+ $dict{ $key } += $mode;
1907
+ }
1908
+ }
1909
+ }
1910
+ }
1911
+
1912
+ close ($dict_handle);
1913
+ }
1914
+
1915
+ sub BuildTmpFile
1916
+ {
1917
+ my ($filename) = @_;
1918
+
1919
+ my $tmpfile = $filename;
1920
+ $tmpfile =~ s/[\.\/]//g;
1921
+ $tmpfile .= $$ . time;
1922
+
1923
+ # Untaint tmpfile variable
1924
+ if ($tmpfile =~ /^([-\@\w.]+)$/)
1925
+ {
1926
+ $tmpfile = $1;
1927
+ }
1928
+
1929
+ return "/tmp/$tmpfile"; # Altered by Min (Thu Feb 28 13:08:59 SGT 2008)
1930
+ }
1931
+
1932
+ 1;
1933
+
1934
+
1935
+
1936
+
1937
+
1938
+
1939
+
1940
+
1941
+
1942
+
1943
+
1944
+
1945
+
1946
+
1947
+
1948
+
1949
+