biblicit 2.0.3 → 2.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,1949 +0,0 @@
1
- package SectLabel::AAMatching;
2
-
3
- ###
4
- # This package provides methods to solve the matching problem
5
- # between author and affiliation in a pdf
6
- #
7
- # Do Hoang Nhat Huy 21 Apr, 11
8
- ###
9
-
10
- use strict;
11
-
12
- # Dependencies
13
- use POSIX;
14
- use IO::File;
15
- use XML::Writer;
16
- use XML::Writer::String;
17
-
18
- use Class::Struct;
19
-
20
- # Local libraries
21
- use SectLabel::Config;
22
- use ParsCit::PostProcess;
23
-
24
- # Dictionary
25
- my %dict = ();
26
- # CRF++
27
- my $crft = $SectLabel::Config::crf_test;
28
-
29
- # Matching features of each author, including
30
- # Signals
31
- # Coordinations: top, bottom, left, right
32
- # Position: page, sections, paragraph, line
33
- struct aut_rcfeatures =>
34
- {
35
- signals => '@',
36
-
37
- top => '$',
38
- bottom => '$',
39
- left => '$',
40
- right => '$',
41
-
42
- page => '$',
43
- section => '$',
44
- para => '$',
45
- line => '$'
46
- };
47
-
48
- # Matching features of each affiliation, including
49
- # Signals
50
- # Coordinations: top, bottom, left, right
51
- # Position: page, sections, paragraph, line
52
- struct aff_rcfeatures =>
53
- {
54
- signals => '@',
55
-
56
- top => '$',
57
- bottom => '$',
58
- left => '$',
59
- right => '$',
60
-
61
- page => '$',
62
- section => '$',
63
- para => '$',
64
- line => '$'
65
- };
66
-
67
- # Author
68
- # Affiliation
69
- sub AAMatching
70
- {
71
- my ($doc, $aut_addrs, $aff_addrs) = @_;
72
-
73
- my $need_object = 1;
74
- # Get the author objects
75
- my $aut_lines = Omni::Traversal::OmniCollector($doc, $aut_addrs, $need_object);
76
- # Get the affiliation objects
77
- my $aff_lines = Omni::Traversal::OmniCollector($doc, $aff_addrs, $need_object);
78
-
79
- # Dictionary
80
- ReadDict($FindBin::Bin . "/../" . $SectLabel::Config::dictFile);
81
-
82
- # Authors
83
- my ($aut_features, $aut_rc_features) = AuthorFeatureExtraction($aut_lines, $aut_addrs);
84
- # Call CRF
85
- my ($aut_signal, $aut_rc) = AuthorExtraction($aut_features, $aut_rc_features);
86
-
87
- # Affiliations
88
- my ($aff_features, $aff_rc_features) = AffiliationFeatureExtraction($aff_lines, $aff_addrs);
89
- # Call CRF
90
- my ($aff_signal, $aff_rc, $affs) = AffiliationExtraction($aff_features, $aff_rc_features);
91
-
92
- # Matching features
93
- my $aa_features = AAFeatureExtraction($aut_rc, $aff_rc);
94
- # Matching
95
- my $aa = AAMatchingImp($aa_features);
96
-
97
- =pod
98
- # DEBUG
99
- my $aut_handle = undef;
100
- my $aff_handle = undef;
101
- my $aau_handle = undef;
102
- my $aaf_handle = undef;
103
- my $aut_debug = undef;
104
- my $aff_debug = undef;
105
- my $aa_handle = undef;
106
-
107
- open $aut_handle, ">:utf8", "aut.features";
108
- open $aff_handle, ">:utf8", "aff.features";
109
- open $aau_handle, ">:utf8", "aau.features";
110
- open $aaf_handle, ">:utf8", "aaf.features";
111
- open $aut_debug, ">:utf8", "aut.debug.features";
112
- open $aff_debug, ">:utf8", "aff.debug.features";
113
- open $aa_handle, ">:utf8", "aa.features";
114
-
115
- print $aut_handle $aut_features;
116
- print $aff_handle $aff_features;
117
- print $aau_handle $aut_rc_features;
118
- print $aaf_handle $aff_rc_features;
119
- print $aa_handle $aa_features, "\n";
120
-
121
- foreach my $author (keys %{ $aut_rc } )
122
- {
123
- print $aut_debug $author, ": ", "\n";
124
-
125
- foreach my $signal (@{ $aut_rc->{ $author }->signals })
126
- {
127
- print $aut_debug "\t", $signal, "\n";
128
- }
129
-
130
- print $aut_debug "\t", $aut_rc->{ $author }->top, "\n";
131
- print $aut_debug "\t", $aut_rc->{ $author }->bottom, "\n";
132
- print $aut_debug "\t", $aut_rc->{ $author }->left, "\n";
133
- print $aut_debug "\t", $aut_rc->{ $author }->right, "\n";
134
-
135
- print $aut_debug "\t", $aut_rc->{ $author }->page, "\n";
136
- print $aut_debug "\t", $aut_rc->{ $author }->section, "\n";
137
- print $aut_debug "\t", $aut_rc->{ $author }->para, "\n";
138
- print $aut_debug "\t", $aut_rc->{ $author }->line, "\n";
139
- }
140
-
141
- foreach my $affiliation (keys %{ $aff_rc } )
142
- {
143
- print $aff_debug $affiliation, ": ", "\n";
144
-
145
- foreach my $signal (@{ $aff_rc->{ $affiliation }->signals })
146
- {
147
- print $aff_debug "\t", $signal, "\n";
148
- }
149
-
150
- print $aff_debug "\t", $aff_rc->{ $affiliation }->top, "\n";
151
- print $aff_debug "\t", $aff_rc->{ $affiliation }->bottom, "\n";
152
- print $aff_debug "\t", $aff_rc->{ $affiliation }->left, "\n";
153
- print $aff_debug "\t", $aff_rc->{ $affiliation }->right, "\n";
154
-
155
- print $aff_debug "\t", $aff_rc->{ $affiliation }->page, "\n";
156
- print $aff_debug "\t", $aff_rc->{ $affiliation }->section, "\n";
157
- print $aff_debug "\t", $aff_rc->{ $affiliation }->para, "\n";
158
- print $aff_debug "\t", $aff_rc->{ $affiliation }->line, "\n";
159
- }
160
-
161
- close $aut_handle;
162
- close $aff_handle;
163
- close $aau_handle;
164
- close $aaf_handle;
165
- close $aut_debug;
166
- close $aff_debug;
167
- close $aa_handle;
168
- # END
169
- =cut
170
-
171
- # Do the matching
172
- # XML string
173
- my $sxml = "";
174
- # and XML writer
175
- my $writer = new XML::Writer(OUTPUT => \$sxml, ENCODING => 'utf-8', DATA_MODE => 'true', DATA_INDENT => 2);
176
-
177
- # Algorithm
178
- $writer->startTag("algorithm", "name" => "AAMatching", "version" => $SectLabel::Config::algorithmVersion);
179
-
180
- # XML header
181
- my $date = `date`; chomp($date);
182
- my $time = `date +%s`; chomp($time);
183
- # Write XML header
184
- $writer->startTag("results", "time" => $time, "date" => $date);
185
-
186
- # Write authors
187
- $writer->startTag("authors");
188
-
189
- # Write the author name and his corresponding institution
190
- foreach my $author (keys %{ $aut_signal })
191
- {
192
- $writer->startTag("author");
193
-
194
- $writer->startTag("fullname", "source" => "parscit");
195
- $writer->characters($author);
196
- $writer->endTag("fullname");
197
-
198
- $writer->startTag("institutions");
199
- =pod
200
- foreach my $signal (@{ $aut_signal->{ $author } })
201
- {
202
- $signal =~ s/^\s+|\s+$//g;
203
- # Skip blank
204
- if ($signal eq "") { next; }
205
-
206
- $writer->startTag("institution", "symbol" => $signal);
207
- $writer->characters($aff_signal->{ $signal });
208
- $writer->endTag("institution");
209
- }
210
- =cut
211
-
212
- foreach my $affiliation (@{ $aa->{ $author } })
213
- {
214
- $writer->startTag("institution");
215
- $writer->characters($affiliation);
216
- $writer->endTag("institution");
217
- }
218
-
219
- $writer->endTag("institutions");
220
-
221
- $writer->endTag("author");
222
- }
223
-
224
- # Finish authors
225
- $writer->endTag("authors");
226
-
227
- # Write institutions
228
- $writer->startTag("institutions");
229
-
230
- # Write the instituion name
231
- foreach my $institute (@{ $affs })
232
- {
233
- $writer->startTag("institution");
234
- $writer->characters($institute);
235
- $writer->endTag("institution");
236
- }
237
-
238
- $writer->endTag("institutions");
239
-
240
- # Done
241
- $writer->endTag("results");
242
- # Done
243
- $writer->endTag("algorithm");
244
- # Done
245
- $writer->end();
246
-
247
- # Return the xml content back to the caller
248
- return $sxml;
249
- }
250
-
251
- # Features of the relational classifier between author and affiliation
252
- sub AAFeatureExtraction
253
- {
254
- my ($aut_rc, $aff_rc) = @_;
255
-
256
- # Relational features
257
- my $features = "";
258
-
259
- # Features between x authors
260
- foreach my $author (keys %{ $aut_rc })
261
- {
262
- my @aut_tokens = split /\s/, $author;
263
- my $author_nb = join '|||', @aut_tokens;
264
-
265
- my $min_aff_x = undef;
266
- my $min_dist_x = LONG_MAX;
267
- my $min_aff_y = undef;
268
- my $min_dist_y = LONG_MAX;
269
- # Find the nearest affiliation
270
- foreach my $aff (keys %{ $aff_rc })
271
- {
272
- my $aut_x = ($aut_rc->{ $author }->left + $aut_rc->{ $author }->right) / 2;
273
- my $aut_y = ($aut_rc->{ $author }->top + $aut_rc->{ $author }->bottom) / 2;
274
-
275
- my $aff_x = ($aff_rc->{ $aff }->left + $aff_rc->{ $aff }->right) / 2;
276
- my $aff_y = ($aff_rc->{ $aff }->top + $aff_rc->{ $aff }->bottom) / 2;
277
-
278
- my $dis_x = abs( $aut_x - $aff_x );
279
- my $dis_y = abs( $aut_y - $aff_y );
280
- # Distance between an author and an affiliation
281
- # my $distance = sqrt( $dis_x * $dis_x + $dis_y * $dis_y );
282
-
283
- # Check if it it the minimum distance in x axis
284
- if ($dis_x < $min_dist_x)
285
- {
286
- $min_dist_x = $dis_x;
287
- $min_aff_x = $aff;
288
- }
289
-
290
- # Check if it it the minimum distance in y axis
291
- if ($dis_y < $min_dist_y)
292
- {
293
- $min_dist_y = $dis_y;
294
- $min_aff_y = $aff;
295
- }
296
- }
297
-
298
- # and y affiliation
299
- foreach my $aff (keys %{ $aff_rc })
300
- {
301
- my @aff_tokens = split /\s/, $aff;
302
- my $aff_nb = join '|||', @aff_tokens;
303
-
304
- # Content
305
- $features .= $author_nb . "#" . $aff_nb . "\t";
306
-
307
- my $signal = undef;
308
- # Signal
309
- if ((scalar(@{ $aut_rc->{ $author }->signals }) == 0) || (scalar(@{ $aff_rc->{ $aff }->signals }) == 0))
310
- {
311
- $signal = "diff";
312
- }
313
- else
314
- {
315
- my $matched = undef;
316
- # Check each author signal
317
- foreach my $aut_sig (@{ $aut_rc->{ $author }->signals })
318
- {
319
- # if it match with affiliation signal
320
- if ($aut_sig eq ${ $aff_rc->{ $aff }->signals }[ 0 ]) { $matched = 1; last; }
321
- }
322
-
323
- $signal = (! defined $matched) ? "diff" : "same";
324
- }
325
-
326
- # Signal
327
- $features .= $signal . "\t";
328
-
329
- # Same page
330
- my $page = ($aut_rc->{ $author }->page == $aff_rc->{ $aff }->page) ? "yes" : "no";
331
- $features .= $page . "\t";
332
-
333
- my $section = undef;
334
- # Same section
335
- if ($page eq "yes")
336
- {
337
- $section = ($aut_rc->{ $author }->section == $aff_rc->{ $aff }->section) ? "yes" : "no";
338
- $features .= $section . "\t";
339
- }
340
- else
341
- {
342
- $section = "no";
343
- $features .= $section . "\t";
344
- }
345
-
346
- my $para = undef;
347
- # Same paragraph
348
- if (($page eq "yes") && ($section eq "yes"))
349
- {
350
- $para = ($aut_rc->{ $author }->para == $aff_rc->{ $aff }->para) ? "yes" : "no";
351
- $features .= $para . "\t";
352
- }
353
- else
354
- {
355
- $para = "no";
356
- $features .= $para . "\t";
357
- }
358
-
359
- my $line = undef;
360
- # Same line
361
- if (($page eq "yes") && ($section eq "yes") && ($para eq "yes"))
362
- {
363
- $line = ($aut_rc->{ $author }->line == $aff_rc->{ $aff }->line) ? "yes" : "no";
364
- $features .= $line . "\t";
365
- }
366
- else
367
- {
368
- $line = "no";
369
- $features .= $line . "\t";
370
- }
371
-
372
- # Is neartest affiliation in x axis ?
373
- my $nearest_x = ($aff eq $min_aff_x) ? "yes" : "no";
374
- $features .= $nearest_x . "\t";
375
-
376
- # Is neartest affiliation in y axis ?
377
- my $nearest_y = ($aff eq $min_aff_y) ? "yes" : "no";
378
- $features .= $nearest_y . "\n";
379
- }
380
- }
381
-
382
- return $features;
383
- }
384
-
385
- # Actually do the matching between author and affiliation
386
- sub AAMatchingImp
387
- {
388
- my ($features) = @_;
389
-
390
- # Temporary input file for CRF
391
- my $infile = BuildTmpFile("aa-input");
392
- # Temporary output file for CRF
393
- my $outfile = BuildTmpFile("aa-output");
394
-
395
- my $output_handle = undef;
396
- # Split and write to temporary input
397
- open $output_handle, ">:utf8", $infile;
398
- # Split
399
- my @lines = split /\n/, $features;
400
- # and write
401
- foreach my $line (@lines)
402
- {
403
- if ($line eq "")
404
- {
405
- print $output_handle "\n";
406
- }
407
- else
408
- {
409
- print $output_handle $line, "\t", "no", "\n";
410
- }
411
- }
412
- # Done
413
- close $output_handle;
414
-
415
- # AA matching model
416
- my $match_model = $SectLabel::Config::matFile;
417
-
418
- # Matching
419
- system("$crft -m $match_model $infile > $outfile");
420
-
421
- # List of authors and their affiliation (if exists)
422
- my %aa = ();
423
-
424
- my $input_handle = undef;
425
- # Read the CRF output
426
- open $input_handle, "<:utf8", $outfile;
427
- # Read each line and get its label
428
- while (<$input_handle>)
429
- {
430
- my $line = $_;
431
- # Trim
432
- $line =~ s/^\s+|\s+$//g;
433
- # Blank linem, what the heck ?
434
- if ($line eq "") { next; }
435
-
436
- # Split the line
437
- my @fields = split /\t/, $line;
438
- # and extract the class and the content
439
- my $class = $fields[ -1 ];
440
- my $content = $fields[ 0 ];
441
-
442
- # You miss
443
- if ($class ne "yes") { next; }
444
-
445
- # Split the content into author name and affiliation name
446
- my @tmp = split /#/, $content;
447
- # Author name
448
- my $author = $tmp[ 0 ];
449
- $author =~ s/\|\|\|/ /g;
450
- # Affiliation name
451
- my $aff = $tmp[ 1 ];
452
- $aff =~ s/\|\|\|/ /g;
453
-
454
- # Save
455
- if (! exists $aa{ $author }) { $aa{ $author } = (); }
456
- # Save
457
- push @{ $aa{ $author } }, $aff;
458
- }
459
-
460
- # Done
461
- close $input_handle;
462
-
463
- # Clean up
464
- unlink $infile;
465
- unlink $outfile;
466
- # Done
467
- return (\%aa);
468
- }
469
-
470
- # Extract affiliation and their signal using crf
471
- sub AffiliationExtraction
472
- {
473
- my ($features, $rc_features) = @_;
474
-
475
- # Temporary input file for CRF
476
- my $infile = BuildTmpFile("aff-input");
477
- # Temporary output file for CRF
478
- my $outfile = BuildTmpFile("aff-output");
479
-
480
- my $output_handle = undef;
481
- # Split and write to temporary input
482
- open $output_handle, ">:utf8", $infile;
483
- # Split
484
- my @lines = split /\n/, $features;
485
- # and write
486
- foreach my $line (@lines)
487
- {
488
- if ($line eq "")
489
- {
490
- print $output_handle "\n";
491
- }
492
- else
493
- {
494
- print $output_handle $line, "\t", "affiliation", "\n";
495
- }
496
- }
497
- # Done
498
- close $output_handle;
499
-
500
- # Author model
501
- my $aff_model = $SectLabel::Config::affFile;
502
-
503
- # Split the authors
504
- system("$crft -m $aff_model $infile > $outfile");
505
-
506
- # Each affiliation can have only one signal
507
- my %asg = ();
508
- # Each affilitiaon can have only one struct
509
- my %aaf = ();
510
- # List of all affiliations
511
- my @aff = ();
512
-
513
- # Each line in the relational features string
514
- my @rc_lines = split /\n/, $rc_features;
515
-
516
- my $input_handle = undef;
517
- # Read the CRF output
518
- open $input_handle, "<:utf8", $outfile;
519
- # Author and signal string
520
- my $prev_class = "";
521
- my @aff_str = ();
522
- my $signal_str = "";
523
- # Relational classifier
524
- my @aaf_rc = ();
525
- # Line counter
526
- my $counter = 0;
527
- # Next to last signal
528
- my $ntl_signal = "";
529
- # Read each line and get its label
530
- # TODO: The code assumes that an affiliation will have the following format: 1 foobar institute
531
- while (<$input_handle>)
532
- {
533
- my $line = $_;
534
- # Trim
535
- $line =~ s/^\s+|\s+$//g;
536
- # Blank line mark the end of an affiliation section
537
- if ($line eq "")
538
- {
539
- if ($prev_class eq "affiliation")
540
- {
541
- my ($affiliation, $rcs) = NormalizeAffiliationName(\@aff_str, \@aaf_rc);
542
- # Save the affiliation
543
- push @aff, $affiliation;
544
- # and its signal
545
- if ($ntl_signal ne "") { $asg{ $ntl_signal } = $affiliation; }
546
-
547
- # Save the signal
548
- push @{ $rcs->signals }, $ntl_signal;
549
- # Save the record
550
- $aaf{ $affiliation } = $rcs;
551
- }
552
- elsif ($prev_class eq "signal")
553
- {
554
- # Save the next to last signal
555
- $ntl_signal = NormalizeAffiliationSignal($signal_str);
556
- }
557
-
558
- # Cleanup
559
- $ntl_signal = "";
560
- # Cleanup
561
- @aff_str = ();
562
- $signal_str = "";
563
- $prev_class = "";
564
- # Cleanup
565
- @aaf_rc = ();
566
-
567
- # Update the counter
568
- $counter++;
569
-
570
- next ;
571
- }
572
-
573
- # Split the line
574
- my @fields = split /\t/, $line;
575
- # and extract the class and the content
576
- my $class = $fields[ -1 ];
577
- my $content = $fields[ 0 ];
578
-
579
- if ($class eq $prev_class)
580
- {
581
- # An affiliation
582
- if ($class eq "affiliation")
583
- {
584
- push @aff_str, $content;
585
- push @aaf_rc, $rc_lines[ $counter ];
586
- }
587
- # A signal
588
- elsif ($class eq "signal")
589
- {
590
- $signal_str .= $content . " ";
591
- }
592
- }
593
- else
594
- {
595
- if ($prev_class eq "affiliation")
596
- {
597
- my ($affiliation, $rcs) = NormalizeAffiliationName(\@aff_str, \@aaf_rc);
598
- # Save the affiliation
599
- push @aff, $affiliation;
600
- # and its signal
601
- if ($ntl_signal ne "") { $asg{ $ntl_signal } = $affiliation; }
602
-
603
- # Save the signal
604
- push @{ $rcs->signals }, $ntl_signal;
605
- # Save the record
606
- $aaf{ $affiliation } = $rcs;
607
-
608
- }
609
- elsif ($prev_class eq "signal")
610
- {
611
- # Save the next to last signal
612
- $ntl_signal = NormalizeAffiliationSignal($signal_str);
613
- }
614
-
615
- # Cleanup
616
- @aff_str = ();
617
- $signal_str = "";
618
- @aaf_rc = ();
619
- # Switch to the current class
620
- $prev_class = $class;
621
-
622
- if ($class eq "affiliation")
623
- {
624
- push @aff_str, $content;
625
- push @aaf_rc, $rc_lines[ $counter ];
626
- }
627
- elsif ($class eq "signal")
628
- {
629
- $signal_str .= $content . " ";
630
- }
631
- }
632
-
633
- # Update the counter
634
- $counter++;
635
- }
636
-
637
- # Final class
638
- if ($prev_class eq "affiliation")
639
- {
640
- my ($affiliation, $rcs) = NormalizeAffiliationName(\@aff_str, \@aaf_rc);
641
- # Save the affiliation
642
- push @aff, $affiliation;
643
- # and its signal
644
- if ($ntl_signal ne "") { $asg{ $ntl_signal } = $affiliation; }
645
-
646
- # Save the signal
647
- push @{ $rcs->signals }, $ntl_signal;
648
- # Save the record
649
- $aaf{ $affiliation } = $rcs;
650
- }
651
- elsif ($prev_class eq "signal")
652
- {
653
- # Save the next to last signal
654
- $ntl_signal = NormalizeAffiliationSignal($signal_str);
655
- }
656
-
657
- # Done
658
- close $input_handle;
659
-
660
- # Clean up
661
- unlink $infile;
662
- unlink $outfile;
663
- # Done
664
- return (\%asg, \%aaf, \@aff);
665
- }
666
-
667
- sub NormalizeAffiliationSignal
668
- {
669
- my ($signal_str) = @_;
670
-
671
- # Trim
672
- $signal_str =~ s/^\s+|\s+$//g;
673
- # Remove all space inside the signature
674
- $signal_str =~ s/\s+//g;
675
-
676
- # Done
677
- return $signal_str;
678
- }
679
-
680
- sub NormalizeAffiliationName
681
- {
682
- my ($aff_str, $aaf_rc) = @_;
683
-
684
- # Constraint
685
- if (scalar(@{ $aff_str }) != scalar(@{ $aaf_rc })) { print STDERR "# It cannot happen, if you encounter it, please consider report it as a bug", "\n"; die; }
686
-
687
- # Affiliation string
688
- my $affiliation = join ' ', @{ $aff_str };
689
-
690
- # First word
691
- my @fields = split /\s/, $aaf_rc->[ 0 ];
692
- # Save the relational features of an affiliation (its first word)
693
- my $rcs = aff_rcfeatures->new( signals => [],
694
- top => $fields[ 1 ], bottom => $fields[ 2 ], left => $fields[ 3 ], right => $fields[ 4 ],
695
- page => $fields[ 5 ], section => $fields[ 6 ], para => $fields[ 7 ], line => $fields[ 8 ] );
696
- # Done
697
- return ($affiliation, $rcs);
698
- }
699
-
700
- # Extract author name and their signal using crf
701
- sub AuthorExtraction
702
- {
703
- my ($features, $rc_features) = @_;
704
-
705
- # Temporary input file for CRF
706
- my $infile = BuildTmpFile("aut-input");
707
- # Temporary output file for CRF
708
- my $outfile = BuildTmpFile("aut-output");
709
-
710
- my $output_handle = undef;
711
- # Split and write to temporary input
712
- open $output_handle, ">:utf8", $infile;
713
- # Split
714
- my @lines = split /\n/, $features;
715
- # and write
716
- foreach my $line (@lines)
717
- {
718
- if ($line eq "")
719
- {
720
- print $output_handle "\n";
721
- }
722
- else
723
- {
724
- print $output_handle $line, "\t", "ns", "\n";
725
- }
726
- }
727
- # Done
728
- close $output_handle;
729
-
730
- # Author model
731
- my $author_model = $SectLabel::Config::autFile;
732
-
733
- # Split the authors
734
- system("$crft -m $author_model $infile > $outfile");
735
-
736
- # Each author can have one or more signals
737
- my %asg = ();
738
- # Each author can have only one struct
739
- my %aas = ();
740
-
741
- # Each line in the relational features string
742
- my @rc_lines = split /\n/, $rc_features;
743
-
744
- my $input_handle = undef;
745
- # Read the CRF output
746
- open $input_handle, "<:utf8", $outfile;
747
- # Author and signal string
748
- my $prev_class = "";
749
- my @author_str = ();
750
- my $signal_str = "";
751
- # Relational classifier
752
- my @author_rc = ();
753
- # Line counter
754
- my $counter = 0;
755
- # Next to last authors
756
- my %ntl_asg = ();
757
- #
758
- my $is_authors = 0;
759
- # Read each line and get its label
760
- while (<$input_handle>)
761
- {
762
- my $line = $_;
763
- # Trim
764
- $line =~ s/^\s+|\s+$//g;
765
- # Blank line mark the end of an author section
766
- if ($line eq "")
767
- {
768
- if ($prev_class eq "author")
769
- {
770
- my ($authors, $rcs) = NormalizeAuthorNames(\@author_str, \@author_rc);
771
- # Save each author
772
- for (my $i = 0; $i < scalar(@{ $authors }); $i++)
773
- {
774
- $asg{ $authors->[ $i ] } = ();
775
- $aas{ $authors->[ $i ] } = $rcs->[ $i ];
776
- $ntl_asg{ $authors->[ $i ] } = 0;
777
- }
778
- }
779
- elsif ($prev_class eq "signal")
780
- {
781
- my $signals = NormalizeAuthorSignal($signal_str);
782
- # Save each signal to its corresponding author
783
- foreach my $author (keys %ntl_asg)
784
- {
785
- foreach my $signal (@{ $signals })
786
- {
787
- push @{ $asg{ $author } }, $signal;
788
- push @{ $aas{ $author }->signals }, $signal;
789
- }
790
- }
791
- }
792
-
793
- # Cleanup
794
- %ntl_asg = ();
795
- # Cleanup
796
- @author_str = ();
797
- $signal_str = "";
798
- @author_rc = ();
799
- # Cleanup
800
- $prev_class = "";
801
-
802
- # Update the counter
803
- $counter++;
804
-
805
- #
806
- $is_authors = 0;
807
-
808
- next;
809
- }
810
-
811
- # Split the line
812
- my @fields = split /\t/, $line;
813
- # and extract the class and the content
814
- my $class = $fields[ -1 ];
815
- my $content = $fields[ 0 ];
816
-
817
- if ($class eq $prev_class)
818
- {
819
- # An author
820
- if ($class eq "author")
821
- {
822
- push @author_str, $content;
823
- push @author_rc, $rc_lines[ $counter ];
824
- }
825
- # A signal
826
- elsif ($class eq "signal")
827
- {
828
- $signal_str .= $content . " ";
829
- }
830
- }
831
- else
832
- {
833
- if ($prev_class eq "author")
834
- {
835
- my ($authors, $rcs) = NormalizeAuthorNames(\@author_str, \@author_rc);
836
- # Save each author
837
- for (my $i = 0; $i < scalar(@{ $authors }); $i++)
838
- {
839
- $asg{ $authors->[ $i ] } = ();
840
- $aas{ $authors->[ $i ] } = $rcs->[ $i ];
841
- $ntl_asg{ $authors->[ $i ] } = 0;
842
- }
843
- }
844
- elsif ($prev_class eq "signal")
845
- {
846
- my $signals = NormalizeAuthorSignal($signal_str);
847
- # Save each signal to its corresponding author
848
- foreach my $author (keys %ntl_asg)
849
- {
850
- foreach my $signal (@{ $signals })
851
- {
852
- push @{ $asg{ $author } }, $signal;
853
- push @{ $aas{ $author }->signals }, $signal;
854
- }
855
- }
856
- }
857
-
858
- # Clean the next to last author list if this current class is author
859
- if (($is_authors == 0) && ($class eq "author")) { %ntl_asg = (); $is_authors = 1; }
860
- #
861
- if ($class eq "signal") { $is_authors = 0; }
862
-
863
- # Cleanup
864
- @author_str = ();
865
- $signal_str = "";
866
- @author_rc = ();
867
- # Switch to the current class
868
- $prev_class = $class;
869
-
870
- if ($class eq "author")
871
- {
872
- push @author_str, $content;
873
- push @author_rc, $rc_lines[ $counter ];
874
- }
875
- elsif ($class eq "signal")
876
- {
877
- $signal_str .= $content . " ";
878
- }
879
- }
880
-
881
- # Update the counter
882
- $counter++;
883
- }
884
-
885
- # Final class
886
- if ($prev_class eq "author")
887
- {
888
- my ($authors, $rcs) = NormalizeAuthorNames(\@author_str, \@author_rc);
889
- # Save each author
890
- for (my $i = 0; $i < scalar(@{ $authors }); $i++)
891
- {
892
- $asg{ $authors->[ $i ] } = ();
893
- $aas{ $authors->[ $i ] } = $rcs->[ $i ];
894
- $ntl_asg{ $authors->[ $i ] } = 0;
895
- }
896
- }
897
- elsif ($prev_class eq "signal")
898
- {
899
- my $signals = NormalizeAuthorSignal($signal_str);
900
- # Save each signal to its corresponding author
901
- foreach my $author (keys %ntl_asg)
902
- {
903
- foreach my $signal (@{ $signals })
904
- {
905
- push @{ $asg{ $author } }, $signal;
906
- push @{ $aas{ $author }->signals }, $signal;
907
- }
908
- }
909
- }
910
-
911
- # Done
912
- close $input_handle;
913
-
914
- # Clean up
915
- unlink $infile;
916
- unlink $outfile;
917
- # Done
918
- return (\%asg, \%aas);
919
- }
920
-
921
- sub NormalizeAuthorNames
922
- {
923
- my ($author_str, $author_rc) = @_;
924
-
925
- # Constraint
926
- if (scalar(@{ $author_str }) != scalar(@{ $author_rc })) { print STDERR "# It cannot happen, if you encounter it, please consider report it as a bug", "\n"; die; }
927
-
928
- # Mark the beginning of an author name
929
- my $begin = 1;
930
- # and its corresponding relational features
931
- my $rcbegin = 0;
932
-
933
- my @current = ();
934
- my @authors = ();
935
- my @rcs = ();
936
- # Check all tokens in the author string
937
- for (my $i = 0; $i < scalar(@{ $author_str }); $i++)
938
- {
939
- my $token = $author_str->[ $i ];
940
-
941
- # Mark the end of an author name
942
- if ($token =~ m/^(&|and|,|;)$/i)
943
- {
944
- if (scalar(@current) != 0)
945
- {
946
- push @authors, ParsCit::PostProcess::NormalizeAuthorName(@current);
947
-
948
- # Save the relational features of an author (its first word)
949
- my @fields = split /\s/, $author_rc->[ $rcbegin ];
950
- # Create new record
951
- my $tmp = aut_rcfeatures->new( signals => [],
952
- top => $fields[ 1 ], bottom => $fields[ 2 ], left => $fields[ 3 ], right => $fields[ 4 ],
953
- page => $fields[ 5 ], section => $fields[ 6 ], para => $fields[ 7 ], line => $fields[ 8 ] );
954
- # Save the record
955
- push @rcs, $tmp;
956
- }
957
-
958
- # Cleanup
959
- @current = ();
960
- $begin = 1;
961
-
962
- next;
963
- }
964
-
965
- # Mark the begin of an author name
966
- if ($begin == 1)
967
- {
968
- push @current, $token;
969
-
970
- $begin = 0;
971
- $rcbegin = $i;
972
-
973
- next;
974
- }
975
-
976
- # Author name ending with a comma
977
- if ($token =~ m/,$/)
978
- {
979
- push @current, $token;
980
-
981
- if (scalar(@current) != 0)
982
- {
983
- push @authors, ParsCit::PostProcess::NormalizeAuthorName(@current);
984
-
985
- # Save the relational features of an author (its first word)
986
- my @fields = split /\s/, $author_rc->[ $rcbegin ];
987
- # Create new record
988
- my $tmp = aut_rcfeatures->new( signals => [],
989
- top => $fields[ 1 ], bottom => $fields[ 2 ], left => $fields[ 3 ], right => $fields[ 4 ],
990
- page => $fields[ 5 ], section => $fields[ 6 ], para => $fields[ 7 ], line => $fields[ 8 ] );
991
- # Save the record
992
- push @rcs, $tmp;
993
- }
994
-
995
- # Cleanup
996
- @current = ();
997
- $begin = 1;
998
- }
999
- # or it's just parts of the name
1000
- else
1001
- {
1002
- push @current, $token;
1003
- }
1004
- }
1005
-
1006
- # Last author name
1007
- if (scalar(@current) != 0)
1008
- {
1009
- push @authors, ParsCit::PostProcess::NormalizeAuthorName(@current);
1010
-
1011
- # Save the relational features of an author (its first word)
1012
- my @fields = split /\s/, $author_rc->[ $rcbegin ];
1013
- # Create new record
1014
- my $tmp = aut_rcfeatures->new( signals => [],
1015
- top => $fields[ 1 ], bottom => $fields[ 2 ], left => $fields[ 3 ], right => $fields[ 4 ],
1016
- page => $fields[ 5 ], section => $fields[ 6 ], para => $fields[ 7 ], line => $fields[ 8 ] );
1017
- # Save the record
1018
- push @rcs, $tmp;
1019
- }
1020
-
1021
- # Done
1022
- return (\@authors, \@rcs);
1023
- }
1024
-
1025
- #
1026
- sub NormalizeAuthorSignal
1027
- {
1028
- my ($signal_str) = @_;
1029
-
1030
- # Trim
1031
- $signal_str =~ s/^\s+|\s+$//g;
1032
- # Split into individual signal
1033
- my @signals = split / |,|:|;/, $signal_str;
1034
-
1035
- # Done
1036
- return \@signals;
1037
- }
1038
-
1039
- # Extract features from affiliation lines
1040
- # The list of features include
1041
- # Content
1042
- # Content, lower case, no punctuation
1043
- # Content length
1044
- # First word in line
1045
- #
1046
- # XML features
1047
- # Subscript, superscript
1048
- # Bold
1049
- # Italic
1050
- # Underline
1051
- # Relative font size
1052
- # Differentiate features
1053
- sub AffiliationFeatureExtraction
1054
- {
1055
- my ($aff_lines, $aff_addrs) = @_;
1056
-
1057
- # NOTE: Relational classifier features
1058
- my $rc_features = "";
1059
-
1060
- # Features will be stored here
1061
- my $features = "";
1062
- # First word in line
1063
- my $is_first_line = undef;
1064
-
1065
- # Font size
1066
- my %fonts = ();
1067
- # Each line contains many runs
1068
- foreach my $line (@{ $aff_lines })
1069
- {
1070
- my $runs = $line->get_objs_ref();
1071
- # Iterator though all work in all lines
1072
- foreach my $run (@{ $runs })
1073
- {
1074
- my $fsize = $run->get_font_size();
1075
- my $words = $run->get_objs_ref();
1076
-
1077
- # Statistic
1078
- if (! exists $fonts{ $fsize })
1079
- {
1080
- $fonts{ $fsize } = scalar(@{ $words });
1081
- }
1082
- else
1083
- {
1084
- $fonts{ $fsize } += scalar(@{ $words });
1085
- }
1086
- }
1087
- }
1088
-
1089
- my $dominate_font = undef;
1090
- # Sort all the font descend with the number of their appearance
1091
- my @sorted = sort { $fonts{ $b } <=> $fonts{ $a } } keys %fonts;
1092
- # Select the dominated font
1093
- $dominate_font = $sorted[ 0 ];
1094
-
1095
- my $size_mismatch = undef;
1096
- # TODO: serious error if the size of aff_lines and the size of aff_addrs mismatch
1097
- if (scalar(@{ $aff_lines }) != scalar(@{ $aff_addrs }))
1098
- {
1099
- $size_mismatch = 1;
1100
- # Print the error but still try to continue
1101
- print STDERR "# Total number of affiliation lines (" . scalar(@{ $aff_lines }) . ") != Total number of affiliation addresses (" . scalar(@{ $aff_addrs }) . ")." . "\n";
1102
- }
1103
-
1104
- my $prev_page = undef;
1105
- my $prev_sect = undef;
1106
- my $prev_para = undef;
1107
- # Each line contains many runs
1108
- for (my $counter = 0; $counter < scalar(@{ $aff_lines }); $counter++)
1109
- {
1110
- # Get the line object
1111
- my $line = $aff_lines->[ $counter ];
1112
-
1113
- # Check the size of aff_lines and aff_addrs
1114
- if (! defined $size_mismatch)
1115
- {
1116
- # Check if two consecutive lines are from two different sections
1117
- if (! defined $prev_page)
1118
- {
1119
- # Init
1120
- $prev_page = $aff_addrs->[ $counter ]->{ 'L1' };
1121
- $prev_sect = $aff_addrs->[ $counter ]->{ 'L2' };
1122
- $prev_para = $aff_addrs->[ $counter ]->{ 'L3' };
1123
- }
1124
- else
1125
- {
1126
- # Affiliations from different sections will be separated immediately
1127
- if (($prev_page != $aff_addrs->[ $counter ]->{ 'L1' }) ||
1128
- ($prev_sect != $aff_addrs->[ $counter ]->{ 'L2' }) ||
1129
- ($prev_para != $aff_addrs->[ $counter ]->{ 'L3' }))
1130
- {
1131
- $features .= "\n";
1132
-
1133
- # NOTE: Relational classifier features
1134
- $rc_features .= "\n";
1135
- }
1136
-
1137
- # Save the paragraph index
1138
- $prev_page = $aff_addrs->[ $counter ]->{ 'L1' };
1139
- $prev_sect = $aff_addrs->[ $counter ]->{ 'L2' };
1140
- $prev_para = $aff_addrs->[ $counter ]->{ 'L3' };
1141
- }
1142
- }
1143
-
1144
- # Set first word in line
1145
- $is_first_line = 1;
1146
-
1147
- # Two previous words
1148
- my $prev_word = undef;
1149
- my $prev_prev_word = undef;
1150
-
1151
- # Format of the previous word
1152
- my ($prev_bold, $prev_italic, $prev_underline, $prev_suscript, $prev_fontsize) = "unknown";
1153
-
1154
- my $runs = $line->get_objs_ref();
1155
- # Iterator though all work in all lines
1156
- foreach my $run (@{ $runs })
1157
- {
1158
- # The run must be non-empty
1159
- my $tmp = $run->get_content();
1160
- # Trim
1161
- $tmp =~ s/^\s+|\s+$//g;
1162
- # Skip blank run
1163
- if ($tmp eq "") { next; }
1164
-
1165
- ###
1166
- # The following features are XML features
1167
- ###
1168
-
1169
- # Bold format
1170
- my $bold = ($run->get_bold() eq "true") ? "bold" : "none";
1171
-
1172
- # Italic format
1173
- my $italic = ($run->get_italic() eq "true") ? "italic" : "none";
1174
-
1175
- # Underline
1176
- my $underline = ($run->get_underline() eq "true") ? "underline" : "none";
1177
-
1178
- # Sub-Sup-script
1179
- my $suscript = ($run->get_suscript() eq "superscript") ? "super" :
1180
- ($run->get_suscript() eq "subscript") ? "sub" : "none";
1181
-
1182
- # Relative font size
1183
- my $fontsize = ($run->get_font_size() > $dominate_font) ? "large" :
1184
- ($run->get_font_size() < $dominate_font) ? "small" : "normal";
1185
-
1186
- ###
1187
- # End of XML features
1188
- ###
1189
-
1190
- # All words in the run
1191
- my $words = $run->get_objs_ref();
1192
-
1193
- # For each word
1194
- foreach my $word (@{ $words })
1195
- {
1196
- # Get word location
1197
- my $top = $word->get_top_pos();
1198
- my $bottom = $word->get_bottom_pos();
1199
- my $left = $word->get_left_pos();
1200
- my $right = $word->get_right_pos();
1201
-
1202
- # NOTE: heuristic rule, for words in the same line
1203
- # If the x-axis distance between this word and the previous word is
1204
- # three times larger than the distance between the previous word and
1205
- # the word before it, then it marks the separator.
1206
- # The better way to do this is to introduce it as a new feature in the
1207
- # author and affiliation model but this step requires re-training these
1208
- # two models, so ...
1209
- #
1210
- # NOTE: Assuming left to right writing
1211
- if (! defined $prev_word)
1212
- {
1213
- $prev_word = $word;
1214
- }
1215
- elsif (! defined $prev_prev_word)
1216
- {
1217
- # NOTE: Words have the power to both destroy and heal, when words are both
1218
- # true and kind, they can change our world
1219
- if (($prev_word->get_left_pos() != $word->get_left_pos()) && ($prev_word->get_right_pos() != $word->get_right_pos()))
1220
- {
1221
- $prev_prev_word = $prev_word;
1222
- $prev_word = $word;
1223
- }
1224
- }
1225
- else
1226
- {
1227
- # NOTE: Words have the power to both destroy and heal, when words are both
1228
- # true and kind, they can change our world
1229
- if (($prev_word->get_left_pos() != $word->get_left_pos()) && ($prev_word->get_right_pos() != $word->get_right_pos()))
1230
- {
1231
- my $prev_dist = abs ($prev_word->get_left_pos() - $prev_prev_word->get_right_pos());
1232
- my $curr_dist = abs ($word->get_left_pos() - $prev_word->get_right_pos());
1233
-
1234
- if ($prev_dist * 5 < $curr_dist)
1235
- {
1236
- $features .= "\n";
1237
-
1238
- # NOTE: Relational classifier features
1239
- $rc_features .= "\n";
1240
- }
1241
-
1242
- $prev_prev_word = $prev_word;
1243
- $prev_word = $word;
1244
- }
1245
- }
1246
-
1247
- # Extract features
1248
- my $full_content = $word->get_content();
1249
- # Trim
1250
- $full_content =~ s/^\s+|\s+$//g;
1251
-
1252
- # Skip blank run
1253
- if ($full_content eq "") { next; }
1254
-
1255
- my @sub_content = ();
1256
- # This is the tricky part, one word e.g. **affiliation will be
1257
- # splitted into two parts: the signal, and the affiliation if
1258
- # possible using regular expression
1259
- while ($full_content =~ m/([\w|-]*)(\W*)/g)
1260
- {
1261
- my $first = $1;
1262
- my $second = $2;
1263
-
1264
- # Trim
1265
- $first =~ s/^\s+|\s+$//g;
1266
- $second =~ s/^\s+|\s+$//g;
1267
-
1268
- # Only keep non-blank content
1269
- if ($first ne "") { push @sub_content, $first; }
1270
-
1271
- # Check the signal and separator
1272
- while ($second =~ m/([,|\.|:|;]*)([^,\.:;]*)/g)
1273
- {
1274
- my $sub_first = $1;
1275
- my $sub_second = $2;
1276
-
1277
- # Trim
1278
- $sub_first =~ s/^\s+|\s+$//g;
1279
- $sub_second =~ s/^\s+|\s+$//g;
1280
-
1281
- # Only keep non-blank separator
1282
- if ($sub_first ne "") { push @sub_content, $sub_first; }
1283
- # Only keep non-blank signal
1284
- if ($sub_second ne "") { push @sub_content, $sub_second; }
1285
- }
1286
- }
1287
-
1288
- foreach my $content (@sub_content)
1289
- {
1290
- # Content
1291
- $features .= $content . "\t";
1292
-
1293
- my $content_n = $content;
1294
- # Remove punctuation
1295
- $content_n =~ s/[^\w]//g;
1296
- # Lower case
1297
- my $content_l = lc($content);
1298
- # Lower case, no punctuation
1299
- my $content_nl = lc($content_n);
1300
- # Lower case
1301
- $features .= $content_l . "\t";
1302
- # Lower case, no punctuation
1303
- if ($content_nl ne "")
1304
- {
1305
- $features .= $content_nl . "\t";
1306
- }
1307
- else
1308
- {
1309
- $features .= $content_l . "\t";
1310
- }
1311
-
1312
- # Split into character
1313
- my @chars = split(//, $content);
1314
- # Content length
1315
- my $length = (scalar(@chars) == 1) ? "1-char" :
1316
- (scalar(@chars) == 2) ? "2-char" :
1317
- (scalar(@chars) == 3) ? "3-char" : "4+char";
1318
- $features .= $length . "\t";
1319
-
1320
- # First word in line
1321
- if ($is_first_line == 1)
1322
- {
1323
- $features .= "begin" . "\t";
1324
-
1325
- # Next words are not the first in line anymore
1326
- $is_first_line = 0;
1327
- }
1328
- else
1329
- {
1330
- $features .= "continue" . "\t";
1331
- }
1332
-
1333
- ###
1334
- # The following features are XML features
1335
- ###
1336
-
1337
- # Bold format
1338
- $features .= $bold . "\t";
1339
-
1340
- # Italic format
1341
- $features .= $italic . "\t";
1342
-
1343
- # Underline
1344
- $features .= $underline . "\t";
1345
-
1346
- # Sub-Sup-script
1347
- $features .= $suscript . "\t";
1348
-
1349
- # Relative font size
1350
- $features .= $fontsize . "\t";
1351
-
1352
- # First word in run
1353
- if (($prev_bold ne $bold) || ($prev_italic ne $italic) || ($prev_underline ne $underline) || ($prev_suscript ne $suscript) || ($prev_fontsize ne $fontsize))
1354
- {
1355
- $features .= "fbegin" . "\t";
1356
- }
1357
- else
1358
- {
1359
- $features .= "fcontinue" . "\t";
1360
- }
1361
-
1362
- # New token
1363
- $features .= "\n";
1364
-
1365
- # Save the XML format
1366
- $prev_bold = $bold;
1367
- $prev_italic = $italic;
1368
- $prev_underline = $underline;
1369
- $prev_suscript = $suscript;
1370
- $prev_fontsize = $fontsize;
1371
-
1372
- # NOTE: Relational classifier features
1373
- # Content
1374
- $rc_features .= $content . "\t";
1375
- # Location
1376
- $rc_features .= $top . "\t";
1377
- $rc_features .= $bottom . "\t";
1378
- $rc_features .= $left . "\t";
1379
- $rc_features .= $right . "\t";
1380
- # Index
1381
- if (! defined $size_mismatch)
1382
- {
1383
- $rc_features .= $aff_addrs->[ $counter ]->{ 'L1' } . "\t";
1384
- $rc_features .= $aff_addrs->[ $counter ]->{ 'L2' } . "\t";
1385
- $rc_features .= $aff_addrs->[ $counter ]->{ 'L3' } . "\t";
1386
- $rc_features .= $aff_addrs->[ $counter ]->{ 'L4' } . "\t";
1387
- }
1388
- # Done
1389
- $rc_features .= "\n";
1390
- }
1391
- }
1392
- }
1393
- }
1394
-
1395
- return ($features, $rc_features);
1396
-
1397
- }
1398
-
1399
- # Extract features from author lines
1400
- # The list of features include
1401
- # Content
1402
- # Content, lower case, no punctuation
1403
- # Content length
1404
- # Capitalization
1405
- # Numeric property
1406
- # Last punctuation
1407
- # First 4-gram
1408
- # Last 4-gram
1409
- # Dictionary
1410
- # First word in line
1411
- #
1412
- # XML features
1413
- # Subscript, superscript
1414
- # Bold
1415
- # Italic
1416
- # Underline
1417
- # Relative font size
1418
- # Differentiate features
1419
- sub AuthorFeatureExtraction
1420
- {
1421
- my ($aut_lines, $aut_addrs) = @_;
1422
-
1423
- # NOTE: Relational classifier features
1424
- my $rc_features = "";
1425
-
1426
- # Features will be stored here
1427
- my $features = "";
1428
- # First word in line
1429
- my $is_first_line = undef;
1430
- # First word in run
1431
- # my $is_first_run = undef;
1432
-
1433
- # Font size
1434
- my %fonts = ();
1435
- # Each line contains many runs
1436
- foreach my $line (@{ $aut_lines })
1437
- {
1438
- my $runs = $line->get_objs_ref();
1439
- # Iterator though all work in all lines
1440
- foreach my $run (@{ $runs })
1441
- {
1442
- my $fsize = $run->get_font_size();
1443
- my $words = $run->get_objs_ref();
1444
-
1445
- # Statistic
1446
- if (! exists $fonts{ $fsize })
1447
- {
1448
- $fonts{ $fsize } = scalar(@{ $words });
1449
- }
1450
- else
1451
- {
1452
- $fonts{ $fsize } += scalar(@{ $words });
1453
- }
1454
- }
1455
- }
1456
-
1457
- my $dominate_font = undef;
1458
- # Sort all the font descend with the number of their appearance
1459
- my @sorted = sort { $fonts{ $b } <=> $fonts{ $a } } keys %fonts;
1460
- # Select the dominated font
1461
- $dominate_font = $sorted[ 0 ];
1462
-
1463
- my $size_mismatch = undef;
1464
- # TODO: serious error if the size of aut_lines and the size of aut_addrs mismatch
1465
- if (scalar(@{ $aut_lines }) != scalar(@{ $aut_addrs }))
1466
- {
1467
- $size_mismatch = 1;
1468
- # Print the error but still try to continue
1469
- print STDERR "# Total number of author lines (" . scalar(@{ $aut_lines }) . ") != Total number of author addresses (" . scalar(@{ $aut_addrs }) . ")." . "\n";
1470
- }
1471
-
1472
- my $prev_page = undef;
1473
- my $prev_sect = undef;
1474
- my $prev_para = undef;
1475
- # Each line contains many runs
1476
- for (my $counter = 0; $counter < scalar(@{ $aut_lines }); $counter++)
1477
- {
1478
- # Get the line object
1479
- my $line = $aut_lines->[ $counter ];
1480
-
1481
- # Check the size of aut_line and aut_addrs
1482
- if (! defined $size_mismatch)
1483
- {
1484
- # Check if two consecutive lines are from two different sections
1485
- if (! defined $prev_page)
1486
- {
1487
- # Init
1488
- $prev_page = $aut_addrs->[ $counter ]->{ 'L1' };
1489
- $prev_sect = $aut_addrs->[ $counter ]->{ 'L2' };
1490
- $prev_para = $aut_addrs->[ $counter ]->{ 'L3' };
1491
- }
1492
- else
1493
- {
1494
- # Authors from different sections will be separated immediately
1495
- if (($prev_page != $aut_addrs->[ $counter ]->{ 'L1' }) ||
1496
- ($prev_sect != $aut_addrs->[ $counter ]->{ 'L2' }) ||
1497
- ($prev_para != $aut_addrs->[ $counter ]->{ 'L3' }))
1498
- {
1499
- $features .= "\n";
1500
-
1501
- # NOTE: Relational classifier features
1502
- $rc_features .= "\n";
1503
- }
1504
-
1505
- # Save the paragraph index
1506
- $prev_page = $aut_addrs->[ $counter ]->{ 'L1' };
1507
- $prev_sect = $aut_addrs->[ $counter ]->{ 'L2' };
1508
- $prev_para = $aut_addrs->[ $counter ]->{ 'L3' };
1509
- }
1510
- }
1511
-
1512
- # Set first word in line
1513
- $is_first_line = 1;
1514
-
1515
- # Previous word and the word before this
1516
- my $prev_prev_word = undef;
1517
- my $prev_word = undef;
1518
-
1519
- # Format of the previous word
1520
- my ($prev_bold, $prev_italic, $prev_underline, $prev_suscript, $prev_fontsize) = "unknown";
1521
-
1522
- my $runs = $line->get_objs_ref();
1523
- # Iterator though all work in all lines
1524
- foreach my $run (@{ $runs })
1525
- {
1526
- # The run must be non-empty
1527
- my $tmp = $run->get_content();
1528
- # Trim
1529
- $tmp =~ s/^\s+|\s+$//g;
1530
- # Skip blank run
1531
- if ($tmp eq "") { next; }
1532
-
1533
- # Set first word in run
1534
- # $is_first_run = 1;
1535
-
1536
- ###
1537
- # The following features are XML features
1538
- ###
1539
-
1540
- # Bold format
1541
- my $bold = ($run->get_bold() eq "true") ? "bold" : "none";
1542
-
1543
- # Italic format
1544
- my $italic = ($run->get_italic() eq "true") ? "italic" : "none";
1545
-
1546
- # Underline
1547
- my $underline = ($run->get_underline() eq "true") ? "underline" : "none";
1548
-
1549
- # Sub-Sup-script
1550
- my $suscript = ($run->get_suscript() eq "superscript") ? "super" :
1551
- ($run->get_suscript() eq "subscript") ? "sub" : "none";
1552
-
1553
- # Relative font size
1554
- my $fontsize = ($run->get_font_size() > $dominate_font) ? "large" :
1555
- ($run->get_font_size() < $dominate_font) ? "small" : "normal";
1556
-
1557
- ###
1558
- # End of XML features
1559
- ###
1560
-
1561
- # All words in the run
1562
- my $words = $run->get_objs_ref();
1563
-
1564
- # For each word
1565
- foreach my $word (@{ $words })
1566
- {
1567
- # Get word location
1568
- my $top = $word->get_top_pos();
1569
- my $bottom = $word->get_bottom_pos();
1570
- my $left = $word->get_left_pos();
1571
- my $right = $word->get_right_pos();
1572
-
1573
- # NOTE: heuristic rule, for words in the same line
1574
- # If the x-axis distance between this word and the previous word is
1575
- # three times larger than the distance between the previous word and
1576
- # the word before it, then it marks the separator.
1577
- # The better way to do this is to introduce it as a new feature in the
1578
- # author and affiliation model but this step requires re-training these
1579
- # two models, so ...
1580
- #
1581
- # NOTE: Assuming left to right writing
1582
- if (! defined $prev_word)
1583
- {
1584
- $prev_word = $word;
1585
- }
1586
- elsif (! defined $prev_prev_word)
1587
- {
1588
- # NOTE: Words have the power to both destroy and heal, when words are both
1589
- # true and kind, they can change our world
1590
- if (($prev_word->get_left_pos() != $word->get_left_pos()) && ($prev_word->get_right_pos() != $word->get_right_pos()))
1591
- {
1592
- $prev_prev_word = $prev_word;
1593
- $prev_word = $word;
1594
- }
1595
- }
1596
- else
1597
- {
1598
- # NOTE: Words have the power to both destroy and heal, when words are both
1599
- # true and kind, they can change our world
1600
- if (($prev_word->get_left_pos() != $word->get_left_pos()) && ($prev_word->get_right_pos() != $word->get_right_pos()))
1601
- {
1602
-
1603
- my $prev_dist = abs ($prev_word->get_left_pos() - $prev_prev_word->get_right_pos());
1604
- my $curr_dist = abs ($word->get_left_pos() - $prev_word->get_right_pos());
1605
-
1606
- if ($prev_dist * 5 < $curr_dist)
1607
- {
1608
- $features .= "\n";
1609
-
1610
- # NOTE: Relational classifier features
1611
- $rc_features .= "\n";
1612
- }
1613
-
1614
- $prev_prev_word = $prev_word;
1615
- $prev_word = $word;
1616
- }
1617
- }
1618
-
1619
- # Extract features
1620
- my $full_content = $word->get_content();
1621
- # Trim
1622
- $full_content =~ s/^\s+|\s+$//g;
1623
-
1624
- # Skip blank run
1625
- if ($full_content eq "") { next; }
1626
-
1627
- my @sub_content = ();
1628
- # This is the tricky part, one word e.g. name** will be splitted
1629
- # into several parts: the name, the signal, and the separator if
1630
- # possible using regular expression
1631
- while ($full_content =~ m/([\w|-]*)(\W*)/g)
1632
- {
1633
- my $first = $1;
1634
- my $second = $2;
1635
-
1636
- # Trim
1637
- $first =~ s/^\s+|\s+$//g;
1638
- $second =~ s/^\s+|\s+$//g;
1639
-
1640
- # Only keep non-blank content
1641
- if ($first ne "") { push @sub_content, $first; }
1642
-
1643
- # Check the signal and separator
1644
- while ($second =~ m/([,|\.|:|;]*)([^,\.:;]*)/g)
1645
- {
1646
- my $sub_first = $1;
1647
- my $sub_second = $2;
1648
-
1649
- # Trim
1650
- $sub_first =~ s/^\s+|\s+$//g;
1651
- $sub_second =~ s/^\s+|\s+$//g;
1652
-
1653
- # Only keep non-blank separator
1654
- if ($sub_first ne "") { push @sub_content, $sub_first; }
1655
- # Only keep non-blank signal
1656
- if ($sub_second ne "") { push @sub_content, $sub_second; }
1657
- }
1658
- }
1659
-
1660
- foreach my $content (@sub_content)
1661
- {
1662
- # Content
1663
- $features .= $content . "\t";
1664
-
1665
- my $content_n = $content;
1666
- # Remove punctuation
1667
- $content_n =~ s/[^\w]//g;
1668
- # Lower case
1669
- my $content_l = lc($content);
1670
- # Lower case, no punctuation
1671
- my $content_nl = lc($content_n);
1672
- # Lower case
1673
- $features .= $content_l . "\t";
1674
- # Lower case, no punctuation
1675
- if ($content_nl ne "")
1676
- {
1677
- $features .= $content_nl . "\t";
1678
- }
1679
- else
1680
- {
1681
- $features .= $content_l . "\t";
1682
- }
1683
-
1684
- # Capitalization
1685
- my $ortho = ($content =~ /^[\p{IsUpper}]$/) ? "single" :
1686
- ($content =~ /^[\p{IsUpper}][\p{IsLower}]+/) ? "init" :
1687
- ($content =~ /^[\p{IsUpper}]+$/) ? "all" : "others";
1688
- $features .= $ortho . "\t";
1689
-
1690
- # Numeric property
1691
- my $num = ($content =~ /^[0-9]$/) ? "1dig" :
1692
- ($content =~ /^[0-9][0-9]$/) ? "2dig" :
1693
- ($content =~ /^[0-9][0-9][0-9]$/) ? "3dig" :
1694
- ($content =~ /^[0-9]+$/) ? "4+dig" :
1695
- ($content =~ /^[0-9]+(th|st|nd|rd)$/) ? "ordinal" :
1696
- ($content =~ /[0-9]/) ? "hasdig" : "nonnum";
1697
- $features .= $num . "\t";
1698
-
1699
- # Last punctuation
1700
- my $punct = ($content =~ /^[\"\'\`]/) ? "leadq" :
1701
- ($content =~ /[\"\'\`][^s]?$/) ? "endq" :
1702
- ($content =~ /\-.*\-/) ? "multi" :
1703
- ($content =~ /[\-\,\:\;]$/) ? "cont" :
1704
- ($content =~ /[\!\?\.\"\']$/) ? "stop" :
1705
- ($content =~ /^[\(\[\{\<].+[\)\]\}\>].?$/) ? "braces" : "others";
1706
- $features .= $punct . "\t";
1707
-
1708
- # Split into character
1709
- my @chars = split(//, $content);
1710
- my $clen = scalar @chars;
1711
- # Content length
1712
- my $length = (scalar(@chars) == 1) ? "1-char" :
1713
- (scalar(@chars) == 2) ? "2-char" :
1714
- (scalar(@chars) == 3) ? "3-char" : "4+char";
1715
- $features .= $length . "\t";
1716
- # First n-gram
1717
- $features .= $chars[ 0 ] . "\t";
1718
- if ($clen >= 2) {
1719
- $features .= join("", @chars[ 0..1 ]) . "\t";
1720
- } else {
1721
- $features .= $length . "\t";
1722
- }
1723
- if ($clen >= 3) {
1724
- $features .= join("", @chars[ 0..2 ]) . "\t";
1725
- } elsif ($clen >= 2) {
1726
- $features .= join("", @chars[ 0..1 ]) . "\t";
1727
- } else {
1728
- $features .= $length . "\t";
1729
- }
1730
- if ($clen >= 4) {
1731
- $features .= join("", @chars[ 0..3 ]) . "\t";
1732
- } elsif ($clen >= 3) {
1733
- $features .= join("", @chars[ 0..2 ]) . "\t";
1734
- } elsif ($clen >= 2) {
1735
- $features .= join("", @chars[ 0..1 ]) . "\t";
1736
- } else {
1737
- $features .= $length . "\t";
1738
- }
1739
- # Last n-gram
1740
- $features .= $chars[ -1 ] . "\t";
1741
- if ($clen >= 2) {
1742
- $features .= join("", @chars[ -2..-1 ]) . "\t";
1743
- } else {
1744
- $features .= $chars[ -1 ] . "\t";
1745
- }
1746
- if ($clen >= 3) {
1747
- $features .= join("", @chars[ -3..-1 ]) . "\t";
1748
- } elsif ($clen >= 2) {
1749
- $features .= join("", @chars[ -2..-1 ]) . "\t";
1750
- } else {
1751
- $features .= $chars[ -1 ] . "\t";
1752
- }
1753
- if ($clen >= 4) {
1754
- $features .= join("", @chars[ -4..-1 ]) . "\t";
1755
- } elsif ($clen >= 3) {
1756
- $features .= join("", @chars[ -3..-1 ]) . "\t";
1757
- } elsif ($clen >= 2) {
1758
- $features .= join("", @chars[ -2..-1 ]) . "\t";
1759
- } else {
1760
- $features .= $chars[ -1 ] . "\t";
1761
- }
1762
-
1763
- # Dictionary
1764
- my $dict_status = (defined $dict{ $content_nl }) ? $dict{ $content_nl } : 0;
1765
- # Possible names
1766
- my ($publisher_name, $place_name, $month_name, $last_name, $female_name, $male_name) = undef;
1767
- # Check all case
1768
- if ($dict_status >= 32) { $dict_status -= 32; $publisher_name = "publisher" } else { $publisher_name = "no"; }
1769
- if ($dict_status >= 16) { $dict_status -= 16; $place_name = "place" } else { $place_name = "no"; }
1770
- if ($dict_status >= 8) { $dict_status -= 8; $month_name = "month" } else { $month_name = "no"; }
1771
- if ($dict_status >= 4) { $dict_status -= 4; $last_name = "last" } else { $last_name = "no"; }
1772
- if ($dict_status >= 2) { $dict_status -= 2; $female_name = "female" } else { $female_name = "no"; }
1773
- if ($dict_status >= 1) { $dict_status -= 1; $male_name = "male" } else { $male_name = "no"; }
1774
- # Save the feature
1775
- $features .= $male_name . "\t";
1776
- $features .= $female_name . "\t";
1777
- $features .= $last_name . "\t";
1778
- $features .= $month_name . "\t";
1779
- $features .= $place_name . "\t";
1780
- $features .= $publisher_name . "\t";
1781
-
1782
- # First word in line
1783
- if ($is_first_line == 1)
1784
- {
1785
- $features .= "begin" . "\t";
1786
-
1787
- # Next words are not the first in line anymore
1788
- $is_first_line = 0;
1789
- }
1790
- else
1791
- {
1792
- $features .= "continue" . "\t";
1793
- }
1794
-
1795
- ###
1796
- # The following features are XML features
1797
- ###
1798
-
1799
- # Bold format
1800
- $features .= $bold . "\t";
1801
-
1802
- # Italic format
1803
- $features .= $italic . "\t";
1804
-
1805
- # Underline
1806
- $features .= $underline . "\t";
1807
-
1808
- # Sub-Sup-script
1809
- $features .= $suscript . "\t";
1810
-
1811
- # Relative font size
1812
- $features .= $fontsize . "\t";
1813
-
1814
- # First word in run
1815
- if (($prev_bold ne $bold) || ($prev_italic ne $italic) || ($prev_underline ne $underline) || ($prev_suscript ne $suscript) || ($prev_fontsize ne $fontsize))
1816
- {
1817
- $features .= "fbegin" . "\t";
1818
-
1819
- # Next words are not the first in line anymore
1820
- # $is_first_run = 0;
1821
- }
1822
- else
1823
- {
1824
- $features .= "fcontinue" . "\t";
1825
- }
1826
-
1827
- # New token
1828
- $features .= "\n";
1829
-
1830
- # Save the XML format
1831
- $prev_bold = $bold;
1832
- $prev_italic = $italic;
1833
- $prev_underline = $underline;
1834
- $prev_suscript = $suscript;
1835
- $prev_fontsize = $fontsize;
1836
-
1837
- # NOTE: Relational classifier features
1838
- # Content
1839
- $rc_features .= $content . "\t";
1840
- # Location
1841
- $rc_features .= $top . "\t";
1842
- $rc_features .= $bottom . "\t";
1843
- $rc_features .= $left . "\t";
1844
- $rc_features .= $right . "\t";
1845
- # Index
1846
- if (! defined $size_mismatch)
1847
- {
1848
- $rc_features .= $aut_addrs->[ $counter ]->{ 'L1' } . "\t";
1849
- $rc_features .= $aut_addrs->[ $counter ]->{ 'L2' } . "\t";
1850
- $rc_features .= $aut_addrs->[ $counter ]->{ 'L3' } . "\t";
1851
- $rc_features .= $aut_addrs->[ $counter ]->{ 'L4' } . "\t";
1852
- }
1853
- # Done
1854
- $rc_features .= "\n";
1855
- }
1856
- }
1857
- }
1858
- }
1859
-
1860
- return ($features, $rc_features);
1861
- }
1862
-
1863
- sub ReadDict
1864
- {
1865
- my ($dictfile) = @_;
1866
-
1867
- # Absolute path
1868
- my $dictfile_abs = File::Spec->rel2abs($dictfile);
1869
- # Dictionary handle
1870
- my $dict_handle = undef;
1871
- open ($dict_handle, "<:utf8", $dictfile_abs) || die "Could not open dict file $dictfile_abs: $!";
1872
-
1873
- my $mode = 0;
1874
- while (<$dict_handle>)
1875
- {
1876
- if (/^\#\# Male/) { $mode = 1; } # male names
1877
- elsif (/^\#\# Female/) { $mode = 2; } # female names
1878
- elsif (/^\#\# Last/) { $mode = 4; } # last names
1879
- elsif (/^\#\# Chinese/) { $mode = 4; } # last names
1880
- elsif (/^\#\# Months/) { $mode = 8; } # month names
1881
- elsif (/^\#\# Place/) { $mode = 16; } # place names
1882
- elsif (/^\#\# Publisher/) { $mode = 32; } # publisher names
1883
- elsif (/^\#/) { next; }
1884
- else
1885
- {
1886
- chop;
1887
- my $key = $_;
1888
- my $val = 0;
1889
- # Has probability
1890
- if (/\t/) { ($key,$val) = split (/\t/,$_); }
1891
-
1892
- # Already tagged (some entries may appear in same part of lexicon more than once
1893
- if (! exists $dict{ $key })
1894
- {
1895
- $dict{ $key } = $mode;
1896
- }
1897
- else
1898
- {
1899
- if ($dict{ $key } >= $mode)
1900
- {
1901
- next;
1902
- }
1903
- # Not yet tagged
1904
- else
1905
- {
1906
- $dict{ $key } += $mode;
1907
- }
1908
- }
1909
- }
1910
- }
1911
-
1912
- close ($dict_handle);
1913
- }
1914
-
1915
- sub BuildTmpFile
1916
- {
1917
- my ($filename) = @_;
1918
-
1919
- my $tmpfile = $filename;
1920
- $tmpfile =~ s/[\.\/]//g;
1921
- $tmpfile .= $$ . time;
1922
-
1923
- # Untaint tmpfile variable
1924
- if ($tmpfile =~ /^([-\@\w.]+)$/)
1925
- {
1926
- $tmpfile = $1;
1927
- }
1928
-
1929
- return "/tmp/$tmpfile"; # Altered by Min (Thu Feb 28 13:08:59 SGT 2008)
1930
- }
1931
-
1932
- 1;
1933
-
1934
-
1935
-
1936
-
1937
-
1938
-
1939
-
1940
-
1941
-
1942
-
1943
-
1944
-
1945
-
1946
-
1947
-
1948
-
1949
-