opener-tokenizer-base 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +148 -0
- data/bin/tokenizer-base +5 -0
- data/bin/tokenizer-de +5 -0
- data/bin/tokenizer-en +5 -0
- data/bin/tokenizer-es +5 -0
- data/bin/tokenizer-fr +5 -0
- data/bin/tokenizer-it +5 -0
- data/bin/tokenizer-nl +5 -0
- data/core/lib/Data/OptList.pm +256 -0
- data/core/lib/Params/Util.pm +866 -0
- data/core/lib/Sub/Exporter.pm +1101 -0
- data/core/lib/Sub/Exporter/Cookbook.pod +309 -0
- data/core/lib/Sub/Exporter/Tutorial.pod +280 -0
- data/core/lib/Sub/Exporter/Util.pm +354 -0
- data/core/lib/Sub/Install.pm +329 -0
- data/core/lib/Time/Stamp.pm +808 -0
- data/core/load-prefixes.pl +43 -0
- data/core/nonbreaking_prefixes/abbreviation_list.kaf +0 -0
- data/core/nonbreaking_prefixes/abbreviation_list.txt +444 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.ca +533 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.de +781 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.el +448 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.en +564 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.es +758 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.fr +1027 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.is +697 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.it +641 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.nl +739 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.pl +729 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.pt +656 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.ro +484 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.ru +705 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.sk +920 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.sl +524 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.sv +492 -0
- data/core/split-sentences.pl +114 -0
- data/core/text-fixer.pl +169 -0
- data/core/tokenizer-cli.pl +363 -0
- data/core/tokenizer.pl +145 -0
- data/lib/opener/tokenizers/base.rb +84 -0
- data/lib/opener/tokenizers/base/version.rb +8 -0
- data/opener-tokenizer-base.gemspec +25 -0
- metadata +134 -0
data/core/split-sentences.pl
ADDED
@@ -0,0 +1,114 @@
#!/usr/bin/perl -w

# Based on the Preprocessor written by Philipp Koehn
# Changed by aazpeitia (aazpeitia@vicomtech.org)

use strict;

my %NONBREAKING_PREFIX = ();
my $LANGUAGE;

sub init_sentence_splitter {
    $LANGUAGE = shift(@_);
    %NONBREAKING_PREFIX = %{ shift(@_) };
}

sub split_sentences {

    my $input_text = shift(@_);
    chomp($input_text);
    my $text = "";
    if (/^<.+>$/ || /^\s*$/) {
        #time to process this block, we've hit a blank or <p>
        #&do_it_for($text,$input_text);
        #print "<P>\n" if (/^\s*$/ && $text); ##if we have text followed by <P>
        #$text = "";
        $text .= &do_it_for($text,$input_text);
        $text .= "<P>\n" if (/^\s*$/ && $text); ##if we have text followed by <P>
    }
    else {
        #append the text, with a space
        $text .= $input_text. " ";
    }
    $text = &do_it_for($text,$input_text);
    return split("\n", $text);
}

sub do_it_for {
    my($text,$markup) = @_;
    $text = &preprocess($text) if $text;
    #print "$markup\n" if ($markup =~ /^<.+>$/);
    return $text;
}

sub preprocess {
    #this is one paragraph
    my($text) = @_;

    # clean up spaces at head and tail of each line as well as any double-spacing
    $text =~ s/ +/ /g;
    $text =~ s/\n /\n/g;
    $text =~ s/ \n/\n/g;
    $text =~ s/^ //g;
    $text =~ s/ $//g;

    #####add sentence breaks as needed#####
    #non-period end-of-sentence markers (?!) followed by sentence starters

    #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    #$text =~ s/([?!]) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
    $text =~ s/([?!]) +([\'\"\(\[\¿\¡\p{IsPi}]*[\w])/$1\n$2/g;
    #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

    #multi-dots followed by sentence starters
    $text =~ s/(\.[\.]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;

    # add breaks for sentences that end with some sort of punctuation inside a quote or parenthetical and are followed by a possible sentence-starter punctuation and upper case
    $text =~ s/([?!\.][\ ]*[\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[\p{IsUpper}])/$1\n$2/g;

    # add breaks for sentences that end with some sort of punctuation and are followed by a sentence-starter punctuation and upper case
    $text =~ s/([?!\.]) +([\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[\p{IsUpper}])/$1\n$2/g;

    # special punctuation cases are covered. Check all remaining periods.
    my $word;
    my $i;
    my @words = split(/ /,$text);
    $text = "";
    for ($i=0;$i<(scalar(@words)-1);$i++) {
        if ($words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/) {
            #check if $1 is a known honorific and $2 is empty, never break
            my $prefix = $1;
            my $starting_punct = $2;
            if($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) {
                #not breaking;
            } elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
                #not breaking - upper case acronym
            } elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/) {
                #the next word has a bunch of initial quotes, maybe a space, then either upper case or a number
                $words[$i] = $words[$i]."\n" unless ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 2 && !$starting_punct && ($words[$i+1] =~ /^[0-9]+/));
                #we always add a return for these unless we have a numeric non-breaker and a number start
            }
        }
        $text = $text.$words[$i]." ";
    }

    #we stopped one token from the end to allow for easy look-ahead. Append it now.
    $text = $text.$words[$i];

    # clean up spaces at head and tail of each line as well as any double-spacing
    $text =~ s/ +/ /g;
    $text =~ s/\n /\n/g;
    $text =~ s/ \n/\n/g;
    $text =~ s/^ //g;
    $text =~ s/ $//g;

    #add trailing break
    $text .= "\n" unless $text =~ /\n$/;

    return $text;
}

1;
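The file above exposes two functions, init_sentence_splitter and split_sentences. As a point of reference, here is a minimal, hypothetical driver showing how they could be called from a sibling script in data/core; the prefix table and the sample sentence are illustrative assumptions only (a value of 1 marks a prefix that never breaks, 2 a prefix that only suppresses a break before a number, mirroring the checks in preprocess).

#!/usr/bin/perl -w
# Hypothetical driver for split-sentences.pl; assumes it sits in the same
# directory as this sketch (data/core).
use strict;
use FindBin;

require "$FindBin::Bin/split-sentences.pl";

# Illustrative prefix table (normally built by load-prefixes.pl):
#   1 = never break after this prefix, 2 = break unless a number follows.
my %prefixes = ( "Dr" => 1, "No" => 2 );
&init_sentence_splitter("en", \%prefixes);

my $line = "Dr. Smith arrived. He met No. 5 on the list! Everyone cheered.";
local $_ = $line;    # split_sentences also inspects $_ for tag/blank lines
my @sentences = &split_sentences($line);
print "$_\n" for @sentences;    # expected: three lines; "Dr." and "No. 5" are not split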
data/core/text-fixer.pl
ADDED
@@ -0,0 +1,169 @@
#!/usr/bin/perl -w

# reads input text and fixes some mistakes
# developed by Andoni Azpeitia

use utf8;

my %NONBREAKING_PREFIX = ();
my $LANGUAGE;

my $START_QUOTES_REGEX = "“|‘|«|‹";
my $END_QUOTES_REGEX = "”|’|»|›";

sub init_text_fixer {
    $LANGUAGE = shift(@_);
    %NONBREAKING_PREFIX = %{ shift(@_) };
}

sub fix_text {

    my($text) = shift(@_);

    chomp($text);

    #fix encoding
    $text = &fix_encoding($text);

    #word token method
    my @words = split(/\s/,$text);
    $text = "";
    for (my $i=0;$i<(scalar(@words));$i++) {
        my $word = $words[$i];
        #Kumi Naidoo said: “bla bla bla.”Bla bla => Kumi Naidoo said: “bla bla bla”. Bla bla
        if ( $word =~ /^(\S+)\.($END_QUOTES_REGEX)($START_QUOTES_REGEX*\p{IsUpper}\S*)$/ ) {
            my $pre = $1;
            my $quote = $2;
            my $post = $3;

            $word = $pre.$quote.". ".$post;
        }
        #to a "breach of trust." A German => to a "breach of trust". A German
        elsif ( $word =~ /^(\S+)\.($END_QUOTES_REGEX)$/ ) {
            my $pre = $1;
            my $quote = $2;
            if ( ($i<scalar(@words)-1 && $words[$i+1] =~ /^$START_QUOTES_REGEX*\p{IsUpper}\S*$/ )) {
                $word = $pre.$quote.".";
            }
            elsif ($i==scalar(@words)-1) {
                $word = $pre.$quote.".";
            }
        }
        #OpeNER is amazing.OpeNER is cool. => OpeNER is amazing. OpeNER is cool.
        elsif ( $word =~ /^(\S+)\.(\S+)$/) {
            my $pre = $1;
            my $post = $2;
            if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($post =~ /^[\p{IsLower}]/) ) {
                #no change
            } elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($post =~ /^[0-9]+/) ) {
                #no change
            } else {
                $word = $pre.". ".$post;
            }
        }
        #OpeNER is amazing .OpeNER is cool. => OpeNER is amazing. OpeNER is cool.
        elsif ( $word =~ /^\.(\p{IsUpper}\S+)$/ ) {
            my $post = $1;
            if ( $i>0 && $words[$i-1] =~ /^(\S+)$/) {
                $word = ". ".$post;
            }
        }
        $text .= $word." ";
    }
    #freedoms." 'Outrageous'Although => freedoms". 'Outrageous' Although
    #$text =~ s/(\")([^\"]+)(\. ?)(\")/$1$2$4$3/g;
    #$text =~ s/(\')([^\']+)(\. ?)(\')/$1$2$4$3/g;
    return $text;
}

sub fix_encoding {

    my $text = shift(@_);

    $text =~ s/â/'/g;
    $text =~ s/Ã€/À/g;
    $text =~ s//“/g;
    $text =~ s///g;
    $text =~ s//\"/g;
    ############################################
    ############################################
    $text =~ s/â€¦/…/g; # ellipsis
    $text =~ s/â¦/…/g; # ellipsis
    $text =~ s/â€“/–/g; # long hyphen
    $text =~ s/â€™/’/g; # curly apostrophe
    $text =~ s/â€œ/“/g; # curly open quote
    $text =~ s/â€/”/g; # curly close quote
    $text =~ s/Â»/»/g;
    $text =~ s/Â«/«/g;
    ############################################
    $text =~ s/Ã¡/á/g;
    $text =~ s/Ã©/é/g;
    $text =~ s/Ã\*/í/g;
    $text =~ s/Ã³/ó/g;
    $text =~ s/Ãº/ú/g;

    $text =~ s/Ã/Á/g;
    $text =~ s/Ã‰/É/g;
    $text =~ s/Ã/Í/g;
    $text =~ s/Ã“/Ó/g;
    $text =~ s/Ãš/Ú/g;
    ############################################
    $text =~ s/Ã±/ñ/g;
    $text =~ s/Ã§/ç/g;
    $text =~ s/Å“/œ/g;

    $text =~ s/Ã‘/Ñ/g;
    $text =~ s/Ã‡/Ç/g;
    $text =~ s/Å’/Œ/g;
    ############################################
    $text =~ s/Â©/©/g;
    $text =~ s/Â®/®/g;
    $text =~ s/â„¢/™/g;
    $text =~ s/Ã˜/Ø/g;
    $text =~ s/Âª/ª/g;
    ############################################
    $text =~ s/Ã¤/ä/g;
    $text =~ s/Ã«/ë/g;
    $text =~ s/Ã¯/ï/g;
    $text =~ s/Ã¶/ö/g;
    $text =~ s/Ã¼/ü/g;

    $text =~ s/Ã„/Ä/g;
    $text =~ s/Ã‹/Ë/g;
    $text =~ s/Ã/Ï/g;
    $text =~ s/Ã–/Ö/g;
    $text =~ s/Ãœ/Ü/g;
    ############################################
    $text =~ s/Ã /à/g;
    $text =~ s/Ã¨/è/g;
    $text =~ s/Ã¬/ì/g;
    $text =~ s/Ã²/ò/g;
    $text =~ s/Ã¹/ù/g;

    $text =~ s/Ã€/À/g;
    $text =~ s/Ãˆ/È/g;
    $text =~ s/ÃŒ/Ì/g;
    $text =~ s/Ã’/Ò/g;
    $text =~ s/Ã™/Ù/g;
    ############################################
    $text =~ s/Ã¢/â/g;
    $text =~ s/Ãª/ê/g;
    $text =~ s/Ã®/î/g;
    $text =~ s/Ã´/ô/g;
    $text =~ s/Ã»/û/g;

    $text =~ s/Ã‚/Â/g;
    $text =~ s/ÃŠ/Ê/g;
    $text =~ s/ÃŽ/Î/g;
    $text =~ s/Ã”/Ô/g;
    $text =~ s/Ã›/Û/g;
    ############################################
    $text =~ s/Ã/E/g;


    return $text;
}

1;
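In the same spirit, a small, hypothetical driver for the fixer above; init_text_fixer and fix_text are the functions defined in text-fixer.pl, the prefix table stands in for the lists under data/core/nonbreaking_prefixes/, and the sample string is taken from the file's own comment.

#!/usr/bin/perl -w
# Hypothetical driver for text-fixer.pl; assumes it sits next to this sketch.
use strict;
use FindBin;

binmode(STDOUT, ":utf8");

require "$FindBin::Bin/text-fixer.pl";

my %prefixes = ( "Dr" => 1 );    # illustrative stand-in for a nonbreaking_prefix file
&init_text_fixer("en", \%prefixes);

# Missing space after a sentence-final period, as in the comment above:
print &fix_text("OpeNER is amazing.OpeNER is cool."), "\n";
# expected output: "OpeNER is amazing. OpeNER is cool. "
# (fix_text re-joins the words with a single trailing space each)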
data/core/tokenizer-cli.pl
ADDED
@@ -0,0 +1,363 @@
#!/usr/bin/perl -w

# Sample Tokenizer
# written by Josh Schroeder, based on code by Philipp Koehn
# changed by Haritz Arzelus (2012/11/19), Aitor García and Andoni Azpeitia

use FindBin;

use lib "$FindBin::Bin/lib";

use Encode::Guess;
use Time::Stamp;

require "$FindBin::Bin"."/text-fixer.pl";
require "$FindBin::Bin"."/split-sentences.pl";
require "$FindBin::Bin"."/tokenizer.pl";
require "$FindBin::Bin"."/load-prefixes.pl";

no warnings;
use encoding 'utf8';

binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
#use FindBin qw($Bin);
#use strict;
#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
#use Time::HiRes;

#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
#my $mydir = "$Bin/nonbreaking_prefixes";
#changed by me (aitor) to point to the directory of the script, instead of the current working directory
#my $mydir = "$FindBin::Bin"."/nonbreaking_prefixes";
#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

#my $start = [ Time::HiRes::gettimeofday( ) ];

#while (@ARGV) {
#    $_ = shift;
#    /^-l$/ && ($language = shift, next);
#    /^-q$/ && ($QUIET = 1, next);
#    /^-h$/ && ($HELP = 1, next);
#}

#if (!$QUIET) {
#    print STDERR "Tokenizer v3\n";
#    print STDERR "Language: $language\n";
#}

#argument variables
my %NONBREAKING_PREFIX = ();
my $SENT_VERSION = "0.0.1";
my $TOK_VERSION = "1.0.1";
my $FILE = "";
my $LANGUAGE;
my $NOTIMESTAMP = 0;
my $HELP = 0;

my $SUBSTITUTE = "####";

if (checkArguments(\@ARGV) == 1) {
    if ($HELP == 1) {
        displayHelp();
        exit 0;
    }
}
else {
    displayHelp();
    exit -1;
}

# load nonbreaking prefixes and init both tokenizer and sentence splitter
%NONBREAKING_PREFIX = %{ &load_prefixes($LANGUAGE) };

&init_text_fixer($LANGUAGE, \%NONBREAKING_PREFIX);
&init_sentence_splitter($LANGUAGE, \%NONBREAKING_PREFIX);
&init_tokenizer($LANGUAGE, \%NONBREAKING_PREFIX);


# get timestamp
my $timestamp = "0000-00-00T00:00:00Z";
if ($NOTIMESTAMP == 0) {
    $timestamp = timestamp();
}

# print kaf header
if ($FILE ne "") {

    my $i = rindex($FILE, ".");
    my $filename = substr($FILE, 0, $i);
    my $filetype = uc(substr($FILE, $i+1, length($FILE)-length($filename)-1));
    print_kafheader($filename, $filetype, $timestamp, $LANGUAGE);
}
else {
    print_kafheader_nofile($timestamp);
}
print " <text>\n";

# process text
my $sent = 1;
my $para = 1;
my $counter = 0;
my $charcount = 0;
while(<STDIN>) {

    if (/^<.+>$/ || /^\s*$/) {
        #don't try to tokenize XML/HTML tag lines
        chomp($_);
        #print $_;
    }
    else {

        #fix input text
        my $text = &fix_text($_);

        #split sentences
        my @sentences = &split_sentences($text);
        my $index = 0;
        my $last_index = 0;
        my $last_offset = -1;
        my $j = 0;
        foreach my $sentence (@sentences) {
            #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
            #print &tokenize($_);
            $tok = tokenize($sentence);
            #tokenize some special characters
            $tok =~ s/([|||||«|»])([^ ])/$1 $2/g;
            $tok =~ s/([^ ])([|||||«|»])/$1 $2/g;
            #detokenize tokens with @
            $tok =~ s/ @ /@/g;
            #detokenize some tokens with '
            $tok =~ s/([DLNO]) '/$1'/g;
            #$tok =~ s/([DLNO])' /$1'/g; changed by Andoni Azpeitia, Vicomtech: L' armée => L'armée
            $tok =~ s/o( )?'( )?clock/o'clock/g;
            $tok =~ s/ ' ([0-9][0-9]s)/ '$1/g;
            #detokenize some time formats
            $tok =~ s/([0-9][0-9]*) ' ([0-9][0-9]*) "/$1'$2"/g;
            $tok =~ s/([0-9][0-9]*) : ([0-9][0-9])/$1:$2/g;
            #detokenize some height formats
            $tok =~ s/([0-9][0-9]*) ' ([0-9][0-9])/$1'$2/g;
            #tokenize two dashes
            $tok =~ s/\-\-/ \-\-/g;
            #correct ºC tokenization
            $tok =~ s/([0-9])( )?º( )?C/$1 ºC/g;
            $tok =~ s/ +/ /g;
            #<<<<<<<<<<<<<<<
            #changed by me (aitor) to format the output as a kind of dummy KAF format
            chomp($tok);

            @tokens = split(/ /, $tok);

            my $i = 0;
            foreach my $token (@tokens) {

                $index = index($_, $token, $last_index);

                #if token was substituted at tokenization, be careful
                if ( $token eq "\"".$SUBSTITUTE ) {
                    $index = index($_, "'", $last_index);
                    $token = "\"";
                }
                elsif ( $token eq "\'".$SUBSTITUTE ) {
                    $index = index($_, "`", $last_index);
                    $token = "\'";
                }

                my $offset = $charcount + $index;
                #if the input text has been preprocessed and a token has been moved ( 'hello.' => 'hello'. ),
                # the offset of the "." char is to the left of the "'" char, not to the right
                if ( $index==-1 ) {
                    $index = index($_, $token, $last_index-2);
                    $offset = $charcount + $index;
                }
                #make sure that the found offset is not the offset of the same char at another position:
                # find the offset of the next token and compare
                # the next token is in the same sentence
                elsif ( $i<scalar(@tokens)-1 ) {
                    my $next_token_index = $charcount + index($_, $tokens[$i+1], length($token) + $last_index-1);
                    if ( $index==-1 || ($next_token_index > -1 && $offset > $next_token_index+1) ) {
                        $index = index($_, $token, $last_index-2);
                        $offset = $charcount + $index;
                    }
                }
                #make sure that the found offset is not the offset of the same char at another position:
                # find the offset of the next token and compare
                # the next token is in the next sentence
                elsif ( $sent < scalar(@sentences)) {
                    my $next_sentence = $sentences[$j+1];
                    $next_sentence = &tokenize($next_sentence);
                    my @next_tokens = split(/ /, $next_sentence);
                    my $next_token = $next_tokens[0];
                    my $next_token_index = $charcount + index($_, $next_token, length($token) + $last_index-1);
                    if ( ($next_token_index > -1 && $offset > $next_token_index+1) ) {
                        $index = index($_, $token, $last_index-2);
                        $offset = $charcount + $index;
                    }
                }

                my $token_length = length($token);
                &print_line(++$counter, $sent, $para, $offset, $token_length, $token);

                $last_index = $index + $token_length;
                $last_offset = $offset;
                $i++;
            }

            #>>>>>>>>>>>>>>>
            $j++;
            $sent++;
            #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
        }#foreach sentence

        if (length($_) == 0) {
            $charcount += 1;
        }
        else {
            $charcount += length($_);
        }
        $para++;
    }
}#while(<STDIN>)
print " </text>\n";
print "</KAF>\n";


#prints word form in kaf format
sub print_line {

    my $wid=shift(@_);
    my $sent=shift(@_);
    my $para=shift(@_);
    my $offset=shift(@_);
    my $length=shift(@_);
    my $token=shift(@_);

    print " <wf wid=\"w".$wid."\" sent=\"".$sent."\" para=\"".$para."\" offset=\"".$offset."\" length=\"". $length."\"><![CDATA[".$token."]]></wf>\n";
}

#prints kaf xml format header
sub print_kafheader {
    my $filename = shift(@_);
    my $filetype = shift(@_);
    my $timestamp = shift(@_);
    my $LANGUAGE = shift(@_);
    print "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n";
    print "<KAF xml:lang=\"".$LANGUAGE."\" version=\"v1.opener\">\n";
    print " <kafHeader>\n";
    print " <fileDesc filename=\"".$filename."\" filetype=\"".$filetype."\" />\n";
    print " <linguisticProcessors layer=\"text\">\n";
    print " <lp name=\"opener-sentence-splitter-$LANGUAGE\" version=\"".$SENT_VERSION."\" timestamp=\"".$timestamp."\"/>\n";
    print " <lp name=\"opener-tokenizer-$LANGUAGE\" version=\"".$TOK_VERSION."\" timestamp=\"".$timestamp."\"/>\n";
    print " </linguisticProcessors>\n";
    print " </kafHeader>\n";
}
#prints kaf xml format header without fileDesc
sub print_kafheader_nofile {
    my $timestamp = shift(@_);
    print "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n";
    print "<KAF xml:lang=\"".$LANGUAGE."\" version=\"v1.opener\">\n";
    print " <kafHeader>\n";
    print " <fileDesc />\n";
    print " <linguisticProcessors layer=\"text\">\n";
    print " <lp name=\"opener-sentence-splitter-$LANGUAGE\" version=\"".$SENT_VERSION."\" timestamp=\"".$timestamp."\"/>\n";
    print " <lp name=\"opener-tokenizer-$LANGUAGE\" version=\"".$TOK_VERSION."\" timestamp=\"".$timestamp."\"/>\n";
    print " </linguisticProcessors>\n";
    print " </kafHeader>\n";
}

sub checkArguments {
    my $argref = shift(@_);
    my @arg = @{ $argref };
    my $correct = 1;
    if (scalar(@arg) > 0) {
        for (my $i = 0; $i < scalar(@arg); $i++) {
            if (lc($arg[$i]) eq "-l") {
                if(scalar(@arg) > $i+1) {
                    if (lc($arg[$i+1]) ne "-t" && lc($arg[$i+1]) ne "-f" && lc($arg[$i+1]) ne "-l" && checkLanguage($arg[$i+1]) == 1) {
                        $LANGUAGE = $arg[$i+1];
                    }
                    else {
                        $correct = 0;
                        print STDERR "Error: language \"".$arg[$i+1]."\" not supported\n";
                    }
                }
                else {
                    $correct = 0;
                    print STDERR "Error: language not specified\n";
                }
            }
            elsif (lc($arg[$i]) eq "-f") {
                if(scalar(@arg) > $i+1 && lc($arg[$i+1]) ne "-t" && lc($arg[$i+1]) ne "-f" && lc($arg[$i+1]) ne "-l") {
                    $FILE = $arg[$i+1];
                }
                else {
                    $correct = 0;
                    print STDERR "Error: file name not specified\n";
                }
            }
            elsif (lc($arg[$i]) eq "-t") {
                $NOTIMESTAMP = 1;
            }
            elsif (lc($arg[$i]) eq "--help") {
                $HELP = 1;
            }
        }
        if ($LANGUAGE ne "") {
            return $correct;
        }
        else {
            print STDERR "Error: language not specified\n";
            return 0;
        }
    }
    else {
        print STDERR "Error: language not specified\n";
        return 0;
    }
}

sub checkLanguage {
    my $language = shift(@_);
    if ($language eq "en") { return 1; }
    elsif ($language eq "es") { return 1; }
    elsif ($language eq "fr") { return 1; }
    elsif ($language eq "it") { return 1; }
    elsif ($language eq "de") { return 1; }
    elsif ($language eq "nl") { return 1; }
    else { return -1 }
}

sub displayHelp {
    print STDERR "\nThis application reads text from standard input and tokenizes it.\n";
    print STDERR "Application arguments:\n";
    print STDERR "-l, --language  language of the input text.\n";
    print STDERR "-f, --filename  (optional) name of the input file.\n";
    print STDERR "-t,             (optional) use a static timestamp in the KAF header.\n";
    print STDERR "--help,         prints this help.\n";
}

sub timestamp {
    my $time = Time::Stamp::gmstamp();
    return $time;
}

sub detect_encoding {
    my $file = shift(@_);
    my $enc;
    open(FILE,$file);
    binmode(FILE);
    if(read(FILE,my $filestart, 500)) {
        $enc = guess_encoding($filestart);
    }
    close(FILE);
    if (ref($enc)) {
        return $enc->name;
    }
    else {
        return "utf8";
    }
}
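Finally, a hypothetical end-to-end smoke test for the command-line driver above, run from the root of the unpacked gem; the -l and -t flags are the ones handled by checkArguments, and the element shape in the comment follows print_line.

#!/usr/bin/perl -w
# Hypothetical smoke test: pipe one line of text through tokenizer-cli.pl
# (English, static timestamp) and let it write its KAF output to stdout.
use strict;

my $cli = "data/core/tokenizer-cli.pl";    # path assumed relative to the gem checkout
open(my $pipe, "|-", "perl", $cli, "-l", "en", "-t")
    or die "cannot start $cli: $!";
print {$pipe} "OpeNER is amazing. OpeNER is cool.\n";
close($pipe) or warn "tokenizer exited with status $?";

# Each token should come back as one <wf> element inside <text>, e.g.:
#   <wf wid="w1" sent="1" para="1" offset="0" length="6"><![CDATA[OpeNER]]></wf>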