biblicit 2.2.1 → 2.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +2 -2
- data/lib/biblicit/version.rb +1 -1
- data/parscit/bin/citeExtract.pl +4 -3
- data/parscit/bin/parsHed/redo.parsHed.pl +2 -1
- data/parscit/bin/phOutput2xml.pl +2 -1
- data/parscit/bin/redo.parsCit.pl +2 -1
- data/parscit/bin/sectLabel/genericSectExtract.rb +1 -1
- data/parscit/bin/tr2crfpp.pl +2 -1
- data/parscit/lib/HeaderParse/Config/API_Config.pm +1 -1
- data/parscit/lib/ParsCit/Tr2crfpp.pm +2 -2
- data/parscit/lib/ParsHed/Tr2crfpp.pm +3 -3
- data/parscit/lib/ParsHed/Tr2crfpp_token.pm +2 -2
- data/parscit/lib/SectLabel/AAMatching.pm +2 -1
- data/parscit/lib/SectLabel/Controller.pm +4 -2
- data/parscit/lib/SectLabel/Tr2crfpp.pm +3 -2
- metadata +4 -5
- data/parscit/doc/index.html +0 -692
data/README.md
CHANGED
@@ -13,7 +13,7 @@ Note: The version is 2.x, but really should be 0.2.x.
|
|
13
13
|
result = Biblicit::Extractor.extract(content: "a string containing the content of a PDF file")
|
14
14
|
|
15
15
|
# Extract metadata from a file using all available tools
|
16
|
-
result = Biblicit::Extractor.extract(file: "myfile.pdf", tools: [:
|
16
|
+
result = Biblicit::Extractor.extract(file: "myfile.pdf", tools: [:parshed, :cb2bib], remote: true, token: false)
|
17
17
|
|
18
18
|
# See reference information for "myfile.pdf"
|
19
19
|
result[:citeseer][:title]
|
@@ -139,7 +139,7 @@ You can specify where you have installed CRF++ by setting the CRFPP_HOME environ
|
|
139
139
|
|
140
140
|
sudo apt-add-repository 'deb http://cl.naist.jp/~eric-n/ubuntu-nlp oneiric all'
|
141
141
|
sudo apt-get update
|
142
|
-
sudo apt-get install libcrf
|
142
|
+
sudo apt-get install libcrf++-dev crf++
|
143
143
|
|
144
144
|
##### On OS X with Homebrew
|
145
145
|
|
data/lib/biblicit/version.rb
CHANGED
data/parscit/bin/citeExtract.pl
CHANGED
@@ -53,8 +53,8 @@ $tmpfile .= $$ . time;
|
|
53
53
|
|
54
54
|
# Untaint tmpfile variable
|
55
55
|
if ($tmpfile =~ /^([-\@\w.]+)$/) { $tmpfile = $1; }
|
56
|
-
|
57
|
-
$tmpfile = "
|
56
|
+
my $tmpdir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
|
57
|
+
$tmpfile = "$tmpdir/$tmpfile";
|
58
58
|
$0 =~ /([^\/]+)$/;
|
59
59
|
my $progname = $1;
|
60
60
|
|
@@ -379,7 +379,8 @@ sub BiblioScript
|
|
379
379
|
my ($types, $pc_xml, $outfile) = @_;
|
380
380
|
|
381
381
|
my @export_types = @{ $types };
|
382
|
-
|
382
|
+
my $base_tmp_dir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
|
383
|
+
my $tmp_dir = $base_tmp_dir . "/" . NewTmpFile();
|
383
384
|
system("mkdir -p $tmp_dir");
|
384
385
|
|
385
386
|
# Write extract_citation output to a tmp file
|
@@ -10,7 +10,8 @@ use FindBin;
|
|
10
10
|
my $tmpfile .= $0; $tmpfile =~ s/[\.\/]//g;
|
11
11
|
$tmpfile .= $$ . time;
|
12
12
|
if ($tmpfile =~ /^([-\@\w.]+)$/) { $tmpfile = $1; } # untaint tmpfile variable
|
13
|
-
$
|
13
|
+
my $tmpdir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
|
14
|
+
$tmpfile = "$tmpdir/$tmpfile";
|
14
15
|
$0 =~ /([^\/]+)$/; my $progname = $1;
|
15
16
|
my $outputVersion = "1.0";
|
16
17
|
|
data/parscit/bin/phOutput2xml.pl
CHANGED
@@ -29,7 +29,8 @@ use strict 'vars';
|
|
29
29
|
my $tmpfile .= $0; $tmpfile =~ s/[\.\/]//g;
|
30
30
|
$tmpfile .= $$ . time;
|
31
31
|
if ($tmpfile =~ /^([-\@\w.]+)$/) { $tmpfile = $1; } # untaint tmpfile variable
|
32
|
-
$
|
32
|
+
my $tmpdir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
|
33
|
+
$tmpfile = "$tmpdir/$tmpfile";
|
33
34
|
$0 =~ /([^\/]+)$/; my $progname = $1;
|
34
35
|
my $outputVersion = "1.0";
|
35
36
|
### END user customizable section
|
data/parscit/bin/redo.parsCit.pl
CHANGED
@@ -5,7 +5,8 @@
|
|
5
5
|
my $tmpfile .= $0; $tmpfile =~ s/[\.\/]//g;
|
6
6
|
$tmpfile .= $$ . time;
|
7
7
|
if ($tmpfile =~ /^([-\@\w.]+)$/) { $tmpfile = $1; } # untaint tmpfile variable
|
8
|
-
$
|
8
|
+
my $tmpdir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
|
9
|
+
$tmpfile = "$tmpdir/$tmpfile";
|
9
10
|
$0 =~ /([^\/]+)$/; my $progname = $1;
|
10
11
|
my $outputVersion = "1.0";
|
11
12
|
my $parscitHome = "/home/wing.nus/services/parscit/tools/";
|
@@ -8,7 +8,7 @@ pwd = File.dirname(__FILE__)
|
|
8
8
|
@CRFPP = ENV['CRFPP_HOME'] ? "#{ENV['CRFPP_HOME']}/bin" : "#{pwd}/../../crfpp"
|
9
9
|
@SRC = "#{pwd}/genericSect"
|
10
10
|
@DATA = "#{pwd}/../../resources/sectLabel/"
|
11
|
-
@TEST_DIR = "/tmp
|
11
|
+
@TEST_DIR = ENV['PARSCIT_TMPDIR'] || "/tmp"
|
12
12
|
|
13
13
|
require "#{@SRC}/forceUtf8"
|
14
14
|
|
data/parscit/bin/tr2crfpp.pl
CHANGED
@@ -29,7 +29,8 @@ use FindBin;
|
|
29
29
|
my $tmpfile .= $0; $tmpfile =~ s/[\.\/]//g;
|
30
30
|
$tmpfile .= $$ . time;
|
31
31
|
if ($tmpfile =~ /^([-\@\w.]+)$/) { $tmpfile = $1; } # untaint tmpfile variable
|
32
|
-
$
|
32
|
+
my $tmpdir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
|
33
|
+
$tmpfile = "$tmpdir/$tmpfile";
|
33
34
|
$0 =~ /([^\/]+)$/; my $progname = $1;
|
34
35
|
my $outputVersion = "1.0";
|
35
36
|
my $dictFile = "$FindBin::Bin/../resources/parsCitDict.txt";
|
@@ -33,7 +33,7 @@ $Resource_Dir = "$parscitHome/resources/headerParse";
|
|
33
33
|
$Database_Dir = "$Resource_Dir/database/";
|
34
34
|
$Data_Dir = "$Resource_Dir/data/";
|
35
35
|
$offlineD = "$Resource_Dir/models/";
|
36
|
-
$Tmp_Dir = "
|
36
|
+
$Tmp_Dir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
|
37
37
|
|
38
38
|
$nMinHeaderLength = 50;
|
39
39
|
$nMaxHeaderLength = 2500;
|
@@ -329,9 +329,9 @@ sub buildTmpFile {
|
|
329
329
|
if ($tmpfile =~ /^([-\@\w.]+)$/) {
|
330
330
|
$tmpfile = $1;
|
331
331
|
}
|
332
|
-
|
333
|
-
|
334
|
-
return "
|
332
|
+
|
333
|
+
my $tmpdir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
|
334
|
+
return "$tmpdir/$tmpfile"; # Altered by Min (Thu Feb 28 13:08:59 SGT 2008)
|
335
335
|
|
336
336
|
} # buildTmpFile
|
337
337
|
|
@@ -216,8 +216,8 @@ sub buildTmpFile {
|
|
216
216
|
$tmpfile = $1;
|
217
217
|
}
|
218
218
|
|
219
|
-
|
220
|
-
return "
|
219
|
+
my $tmpdir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
|
220
|
+
return "$tmpdir/$tmpfile"; # Altered by Min (Thu Feb 28 13:08:59 SGT 2008)
|
221
221
|
|
222
222
|
} # buildTmpFile
|
223
223
|
|
@@ -1930,7 +1930,8 @@ sub BuildTmpFile
|
|
1930
1930
|
$tmpfile = $1;
|
1931
1931
|
}
|
1932
1932
|
|
1933
|
-
|
1933
|
+
my $tmpdir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
|
1934
|
+
return "$tmpdir/$tmpfile"; # Altered by Min (Thu Feb 28 13:08:59 SGT 2008)
|
1934
1935
|
}
|
1935
1936
|
|
1936
1937
|
1;
|
@@ -212,7 +212,7 @@ sub GetGenericHeaders
|
|
212
212
|
my $num_headers = scalar(@{ $headers });
|
213
213
|
|
214
214
|
# Put the list of headers to file
|
215
|
-
|
215
|
+
my $header_file = NewTmpFile();
|
216
216
|
|
217
217
|
$generic_sect_path = UntaintPath($generic_sect_path);
|
218
218
|
|
@@ -328,7 +328,9 @@ sub NewTmpFile
|
|
328
328
|
|
329
329
|
chomp($tmpfile);
|
330
330
|
$tmpfile = UntaintPath($tmpfile);
|
331
|
-
|
331
|
+
|
332
|
+
my $tmpdir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
|
333
|
+
return "$tmpdir/$tmpfile";
|
332
334
|
}
|
333
335
|
|
334
336
|
1;
|
@@ -1079,8 +1079,9 @@ sub BuildTmpFile
|
|
1079
1079
|
{
|
1080
1080
|
$tmpfile = $1;
|
1081
1081
|
}
|
1082
|
-
|
1083
|
-
|
1082
|
+
|
1083
|
+
my $tmpdir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
|
1084
|
+
return "$tmpdir/$tmpfile"; # Altered by Min (Thu Feb 28 13:08:59 SGT 2008)
|
1084
1085
|
}
|
1085
1086
|
|
1086
1087
|
|
metadata
CHANGED
@@ -2,14 +2,14 @@
|
|
2
2
|
name: biblicit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 2.2.
|
5
|
+
version: 2.2.2
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- David Judd
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-05-
|
12
|
+
date: 2013-05-13 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
prerelease: false
|
@@ -153,7 +153,6 @@ files:
|
|
153
153
|
- parscit/bin/sectLabel/single2multi.pl
|
154
154
|
- parscit/bin/sectLabel/tr2crfpp.pl
|
155
155
|
- parscit/bin/tr2crfpp.pl
|
156
|
-
- parscit/doc/index.html
|
157
156
|
- parscit/lib/CSXUtil/SafeText.pm
|
158
157
|
- parscit/lib/HeaderParse/API/AssembleXMLMetadata.pm
|
159
158
|
- parscit/lib/HeaderParse/API/Function.pm
|
@@ -435,7 +434,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
435
434
|
- !ruby/object:Gem::Version
|
436
435
|
segments:
|
437
436
|
- 0
|
438
|
-
hash:
|
437
|
+
hash: -1109935214577805230
|
439
438
|
version: '0'
|
440
439
|
none: false
|
441
440
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
@@ -444,7 +443,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
444
443
|
- !ruby/object:Gem::Version
|
445
444
|
segments:
|
446
445
|
- 0
|
447
|
-
hash:
|
446
|
+
hash: -1109935214577805230
|
448
447
|
version: '0'
|
449
448
|
none: false
|
450
449
|
requirements:
|
data/parscit/doc/index.html
DELETED
@@ -1,692 +0,0 @@
|
|
1
|
-
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 3.0//EN">
|
2
|
-
|
3
|
-
<HTML><HEAD>
|
4
|
-
<META HTTP-EQUIV="Keywords" NAME="Keywords" CONTENT="citations, citation parser, references, reference parser, bibliography parser, reference string parser, bibtex, citation, citation extraction, logical structure, document logical structure">
|
5
|
-
<LINK REL="stylesheet" HREF="parsCit.css"><TITLE>ParsCit: An open-source CRF Reference String and Logical Document Structure Parsing Package</TITLE>
|
6
|
-
<script type="text/javascript">
|
7
|
-
function toggleLayer( whichLayer ) {
|
8
|
-
var elem, vis;
|
9
|
-
if( document.getElementById ) // this is the way the standards work
|
10
|
-
elem = document.getElementById( whichLayer );
|
11
|
-
else if( document.all ) // this is the way old msie versions work
|
12
|
-
elem = document.all[whichLayer];
|
13
|
-
else if( document.layers ) // this is the way nn4 works
|
14
|
-
elem = document.layers[whichLayer];
|
15
|
-
vis = elem.style;
|
16
|
-
// if the style.display value is blank we try to figure it out here
|
17
|
-
if(vis.display==''&&elem.offsetWidth!=undefined&&elem.offsetHeight!=undefined)
|
18
|
-
vis.display = (elem.offsetWidth!=0&&elem.offsetHeight!=0)?'block':'none';
|
19
|
-
vis.display = (vis.display==''||vis.display=='block')?'none':'block';
|
20
|
-
}
|
21
|
-
</script>
|
22
|
-
</HEAD><BODY BGCOLOR="#FFFFFF">
|
23
|
-
|
24
|
-
<div id="leftcontent">
|
25
|
-
[ <A href="http://wing.comp.nus.edu.sg/">WING homepage</A> ]<br/>
|
26
|
-
[ <A href="http://wing.comp.nus.edu.sg/portal/web-services.html">WING web services</A> ]<br/>
|
27
|
-
<BR/>
|
28
|
-
[ <A href="#d">Download</a> ]<br/>
|
29
|
-
[ <A href="#ws">Web Service</a> ]<br/>
|
30
|
-
[ <A href="#wd">Web Demo</a> ]<br/>
|
31
|
-
[ <A href="#p">Publications</a> ]<br/>
|
32
|
-
[ <A href="#gsiso">Input and Output</a> ]<br/>
|
33
|
-
[ <A href="#gm">Group Members</a> ]<br/>
|
34
|
-
[ <A href="#faq">FAQ</a> ]<br/>
|
35
|
-
[ <A href="#t">Troubleshooting</a> ]<br/>
|
36
|
-
<br/>
|
37
|
-
<script type="text/javascript"
|
38
|
-
src="http://feedjit.com/serve/?vv=932&tft=3&dd=0&wid=0804fba767d140cd&pid=0&proid=0&bc=FFFFFF&tc=000000&brd1=012B6B&lnk=135D9E&hc=FFFFFF&hfc=2853A8&btn=C99700&ww=200&wne=10&wh=WING+Live+Traffic+Feed&hl=0&hlnks=0&hfce=0&srefs=1&hbars=0"></script><noscript><a
|
39
|
-
href="http://feedjit.com/">Feedjit Live Blog Stats</a></noscript>
|
40
|
-
</div>
|
41
|
-
|
42
|
-
<div id="centercontent">
|
43
|
-
<IMG ALIGN="LEFT" SRC="parsCit.png" WIDTH="200px" ALT="Picture of ParsCit Swami">
|
44
|
-
|
45
|
-
<CENTER><H1>ParsCit: An open-source CRF Reference String and Logical Document Structure Parsing Package</H1></CENTER>
|
46
|
-
|
47
|
-
<P>This is the home page of the ParsCit project, which performs two
|
48
|
-
tasks: 1) reference string parsing, sometimes also called citation
|
49
|
-
parsing or citation extraction, and 2) logical structure parsing of
|
50
|
-
scientific documents. It is architected as a supervised machine
|
51
|
-
learning procedure that uses Conditional Random Fields as its learning
|
52
|
-
mechanism. You can download the code below, parse strings online, or
|
53
|
-
send batch jobs to our web service. The code contains both the
|
54
|
-
training data, feature generator and shell scripts to connect the
|
55
|
-
system to a web service (used on this web site).</P>
|
56
|
-
|
57
|
-
<P>Some definitions (thanks to Robert Dale for Citations and Reference
|
58
|
-
Strings):
|
59
|
-
|
60
|
-
<DL>
|
61
|
-
|
62
|
-
<DT>Reference String:</DT><DD>A text string in the bibliography or
|
63
|
-
reference section of a work, usually at the end of the document that
|
64
|
-
refers to a unique document. Usually occurs with other reference
|
65
|
-
strings that point to other documents. May also appear as
|
66
|
-
footnotes.</DD>
|
67
|
-
|
68
|
-
<DT>Citation:</DT><DD>A text string (usually explicit) in the
|
69
|
-
document body that points to a corresponding reference string at the
|
70
|
-
end of the document. Several citations may co-refer to a single
|
71
|
-
reference string.</DD>
|
72
|
-
|
73
|
-
<DT>Document Logical Structure:</DT><DD> A hierarchy of logical
|
74
|
-
components, for example, titles, authors, affiliations, abstracts,
|
75
|
-
sections, etc., according to (Mao, Rosenfeld &
|
76
|
-
Kanungo,2003). Our logical structure is more comprehensive,
|
77
|
-
comprising not only header metadata and references, but also the
|
78
|
-
logical structure of the internals of the document -- sections,
|
79
|
-
subsections, figures, tables, equations, footnotes and
|
80
|
-
captions. </DD>
|
81
|
-
|
82
|
-
</DL>
|
83
|
-
|
84
|
-
<P>This project deals with the problem of parsing the reference
|
85
|
-
strings and parsing the logical structure of a document. The first
|
86
|
-
task is handled by a module with the project namesake, ParsCit, and
|
87
|
-
the second task by a separate module SectLabel.
|
88
|
-
</P>
|
89
|
-
|
90
|
-
<br clear="all"/>
|
91
|
-
<!-- License ---------------------------------------------------------------------- -->
|
92
|
-
<A NAME="l"></A><H2>License</H2>
|
93
|
-
|
94
|
-
<P>This software is licensed under the <A
|
95
|
-
HREF="http://www.gnu.org/copyleft/lesser.html">Lesser GNU Public
|
96
|
-
License</A> (LGPL), which means you are free to use it for any
|
97
|
-
purpose, including embedding in commercial products. </P>
|
98
|
-
|
99
|
-
<br clear="all" />
|
100
|
-
<!-- Download ---------------------------------------------------------------------- -->
|
101
|
-
<A NAME="d"></A><H2>Download</H2>
|
102
|
-
|
103
|
-
<P>You can download the open-source code for ParsCit here. The source requires you to re-compile the CRFPP source code
|
104
|
-
and assumes that perl is installed on your system and can be invoked
|
105
|
-
using <CODE>perl</CODE> (must be in your path).
|
106
|
-
</P>
|
107
|
-
|
108
|
-
<ul>
|
109
|
-
|
110
|
-
<li> Current version <A HREF="parscit-110505b.zip">110505b</A>: Added XML::Twig for XML processing. ParsCit now uses input provided by SectLabel. See <A HREF="CHANGELOG.txt"> CHANGELOG.txt </A>.<BR/>
|
111
|
-
The (partially ported) <A HREF="parscit-110505b-win.zip">Windows</A> version is here (provided by Yumichika). See the <A HREF="CHANGES%20FOR%20WINDOWS.txt">CHANGES FOR WINDOWS.txt</A>
|
112
|
-
<BR/>
|
113
|
-
<BR/>
|
114
|
-
We have also pushed a copy of the ParsCit current distribution into <A HREF="http://www.github.com/knmnyn/parscit">GitHub:knmnyn/parscit</A>.
|
115
|
-
The Windows version has also been pushed to <A HREF="http://www.github.com/wing-nus/parscit">GitHub:wing-nus/parscit</A>.
|
116
|
-
|
117
|
-
While we'll strive to keep the GitHub version as updated as possible, the versions on this page will remain the most authoritative for major updates.
|
118
|
-
<BR/>
|
119
|
-
<li> Other versions: <BR/>
|
120
|
-
<A HREF="parscit-101101.zip">101101</A>: Incorporated <A HREF="http://github.com/mromanello/BiblioScript">BiblioScript</A> and <A HREF=http://www.scripps.edu/~cdputnam/software/bibutils>BibUtils</A> software. See CHANGELOG.txt; <BR/>
|
121
|
-
<A HREF="parscit-100401.zip">100401d</A>: Added SectLabel (logical structure parsing) software from the NUS team, and Iconip training data from Cheong Chi Hong for ParsCit with new ParsCit model retrained. See CHANGELOG.txt; <BR/>
|
122
|
-
<A HREF="parscit-090625.zip">090625b</A>: Added documentation for complete re-installation. Improved ParsHed with added line-level CRF model together and post-processing module by NUS team, WSDL file and client for service at NUS and minor bug fixes for ParsCit. See CHANGELOG.txt; <BR/>
|
123
|
-
<A HREF="parscit-090316.zip">090316</A>: Incorporation of ParsHed (header parsing) software from the NUS team. See CHANGELOG.txt; <BR/>
|
124
|
-
<A HREF="parscit-081201.zip">081201</A>: Bug fixes and incorporation of byte position offset from the Scienstein.org team. See CHANGELOG.txt; <BR/>
|
125
|
-
<A HREF="parscit-080917.zip">080917</A>: Minor changes (improved models and multilingual support), see CHANGELOG.txt; <BR/>
|
126
|
-
<A HREF="parscit-080402.zip">080402</A>: First public release. Comes with precompiled linux binaries for CRF++; <BR/>
|
127
|
-
<A HREF="parscit-080310.tgz">080310</A>: Beta release.
|
128
|
-
|
129
|
-
<li><A HREF="http://crfpp.sourceforge.net">CRF++</A>: A conditional random fields toolkit that you may need to install, if the compiled one does not work for you. We recommend that you use version 0.51. </ul>
|
130
|
-
|
131
|
-
<!-- Web Service ---------------------------------------------------------------------- -->
|
132
|
-
<A name="ws"></a><H2>Web Service</h2>
|
133
|
-
|
134
|
-
<P>More NLP services are now being made available on the web.
|
135
|
-
Following this trend you can send your plain text citations to us via
|
136
|
-
our web service. We will parse these for you free of charge (as and
|
137
|
-
when time and processing power allows, these processes are done with
|
138
|
-
lower priority).</P>
|
139
|
-
|
140
|
-
<P CLASS="red">N.B. We keep logs of what's parsed in these demos, to
|
141
|
-
improve the accuracy and productivity of ParsCit. If you'd like these
|
142
|
-
to be kept private or you find you use this service a lot, why not
|
143
|
-
install a local copy of ParsCit for yourself? If you do, please
|
144
|
-
let us know where you are so we acknowledge you here and can re-direct
|
145
|
-
some traffic your way.
|
146
|
-
</P>
|
147
|
-
|
148
|
-
<UL>
|
149
|
-
<LI> <A HREF="wing.nus.wsdl">Download the WSDL file</A> for the service at NUS.
|
150
|
-
<LI> <A HREF="ParsCitClientWSDL.rb">Download the sample ruby client
|
151
|
-
that uses the WSDL file</A> to dynamically generate the ParsCit web
|
152
|
-
service call to the NUS server. Edit the file to see how to
|
153
|
-
execute it.
|
154
|
-
<LI> <A HREF="ParsCitClient.rb">Download sample ruby client code</a>
|
155
|
-
for the ParsCit web service at the NUS server. To execute,
|
156
|
-
just point it at a local
|
157
|
-
text file that represents the text dump of a scholarly article
|
158
|
-
(such as one produced by a PDF to text converter):
|
159
|
-
<CODE>
|
160
|
-
./ParsCitClient.rb ~/public_html/samples/E06-1050.txt
|
161
|
-
</CODE>
|
162
|
-
<LI><FORM METHOD="post" ACTION="parsCit.cgi"><INPUT TYPE="HIDDEN"
|
163
|
-
NAME="ping" VALUE="ping"><INPUT TYPE="SUBMIT" VALUE="Check"> whether
|
164
|
-
the web service is up.
|
165
|
-
</FORM>
|
166
|
-
</UL>
|
167
|
-
|
168
|
-
<!-- Web demo ----------------------------------------------------------------------- -->
|
169
|
-
<A name="wd"></a><H2>Web-based Demonstration</H2>
|
170
|
-
|
171
|
-
<P CLASS="red">N.B.: We keep logs of what's parsed in these demos, to
|
172
|
-
improve the accuracy and productivity of ParsCit. If you'd like these
|
173
|
-
to be kept private, why not install a local copy of ParsCit for
|
174
|
-
yourself?</P>
|
175
|
-
|
176
|
-
<P>You can also run ParsCit directly in your browser. The form below
|
177
|
-
submits your text input (after suitable cleaning) to the ParsCit
|
178
|
-
service to parse the input file or strings. <FONT COLOR="red">
|
179
|
-
Note that if system load gets high, your demo call may not be executed. If you want to run this program in batch, please download your own copy.</FONT>
|
180
|
-
</P>
|
181
|
-
|
182
|
-
<P><B>Demo #1: Parsing the header, logical structure and/or reference strings (and citation contexts) from a text file</B></P>
|
183
|
-
|
184
|
-
<DIV STYLE="background-color:D0D0FF; padding: 1em">
|
185
|
-
<FORM ENCTYPE="multipart/form-data" METHOD="post" ACTION="parsCit.cgi">
|
186
|
-
<P>NB - this demo does not handle PDF input at this time. You can use another web service or software to convert PDFs to text. </P>
|
187
|
-
<P style="font-size:small;"><I>Internal key (if applicable):</I> <INPUT TYPE="password" SIZE="4" NAME="key"></P>
|
188
|
-
<INPUT TYPE="text" SIZE="80" NAME="demo" value="1" style="display:none;">
|
189
|
-
<P>Input Method 1) Enter a URL to a file on the web (e.g., <A HREF="http://wing.comp.nus.edu.sg/~wing.nus/samples/E06-1050.txt">http://wing.comp.nus.edu.sg/~wing.nus/samples/E06-1050.txt</A> or <A HREF="http://wing.comp.nus.edu.sg/~wing.nus/samples/W06-0102.txt">http://wing.comp.nus.edu.sg/~wing.nus/samples/W06-0102.txt</A>).<BR/>
|
190
|
-
<INPUT TYPE="text" SIZE="80" NAME="urlfile">
|
191
|
-
</P>
|
192
|
-
|
193
|
-
<P>Input Method 2) Upload a .txt file (ASCII; UTF-8)<BR/>
|
194
|
-
<INPUT TYPE="FILE" NAME="datafile">
|
195
|
-
</P>
|
196
|
-
|
197
|
-
<P>Input Method 3) Paste the whole file here:
|
198
|
-
<br/>
|
199
|
-
<TEXTAREA ROWS="4" COLS="80" NAME="textfile">
|
200
|
-
</TEXTAREA>
|
201
|
-
</P>
|
202
|
-
<P>Parse the document using the following options
|
203
|
-
<SELECT NAME="ParsCitOptions">
|
204
|
-
<OPTION SELECTED VALUE="5">all</OPTION>
|
205
|
-
<OPTION VALUE="1">citations</OPTION>
|
206
|
-
<OPTION VALUE="2">header</OPTION>
|
207
|
-
<OPTION VALUE="4">section</OPTION>
|
208
|
-
</SELECT>
|
209
|
-
</P>
|
210
|
-
|
211
|
-
<P>Citation export formats
|
212
|
-
<INPUT TYPE=CHECKBOX NAME="ads1">ADS
|
213
|
-
<INPUT TYPE=CHECKBOX NAME="bib1" CHECKED>BIB
|
214
|
-
<INPUT TYPE=CHECKBOX NAME="end1">EndNote
|
215
|
-
<INPUT TYPE=CHECKBOX NAME="isi1">ISI
|
216
|
-
<INPUT TYPE=CHECKBOX NAME="ris1">RIS
|
217
|
-
<INPUT TYPE=CHECKBOX NAME="wordbib1">WordBib
|
218
|
-
</P>
|
219
|
-
|
220
|
-
|
221
|
-
<br/><CENTER><INPUT TYPE="SUBMIT" VALUE="Parse this file!"></CENTER>
|
222
|
-
</FORM>
|
223
|
-
</DIV>
|
224
|
-
|
225
|
-
<P><B>Demo #2: As above but using XML input (XML must conform to Omnipage output). This demo is slow so please be patient.</B></P>
|
226
|
-
<DIV STYLE="background-color:D0D0FF; padding: 1em">
|
227
|
-
<FORM ENCTYPE="multipart/form-data" METHOD="post" ACTION="parsCit.cgi">
|
228
|
-
<INPUT TYPE="text" SIZE="80" NAME="demo" value="2" style="display:none;">
|
229
|
-
<P style="font-size:small;"><I>Internal key (if applicable):</I> <INPUT TYPE="password" SIZE="4" NAME="key"></P>
|
230
|
-
<P>Input Method 1) Enter a URL to a file on the web (e.g., <A HREF="http://wing.comp.nus.edu.sg/~wing.nus/samples/E06-1050.xml">http://wing.comp.nus.edu.sg/~wing.nus/samples/E06-1050.xml</A> or <A HREF="http://wing.comp.nus.edu.sg/~wing.nus/samples/W06-0102.xml">http://wing.comp.nus.edu.sg/~wing.nus/samples/W06-0102.xml</A>).<BR/>
|
231
|
-
<INPUT TYPE="text" SIZE="80" NAME="urlfile">
|
232
|
-
</P>
|
233
|
-
|
234
|
-
<P>Input Method 2) Upload a .xml file (ASCII; UTF-8)<BR/>
|
235
|
-
<INPUT TYPE="FILE" NAME="datafile">
|
236
|
-
</P>
|
237
|
-
|
238
|
-
<P>Input Method 3) Paste the whole .xml file here:
|
239
|
-
<br/>
|
240
|
-
<TEXTAREA ROWS="4" COLS="80" NAME="textfile">
|
241
|
-
</TEXTAREA>
|
242
|
-
</P>
|
243
|
-
|
244
|
-
<P>Input Method 4) Upload your own .pdf file (less than 50 pages & smaller than 10MB):
|
245
|
-
<br/>
|
246
|
-
<INPUT TYPE="FILE" NAME="pdffile">
|
247
|
-
</P>
|
248
|
-
|
249
|
-
<P>Parse the document using the following options
|
250
|
-
<SELECT NAME="ParsCitOptions">
|
251
|
-
<OPTION SELECTED VALUE="5">all</OPTION>
|
252
|
-
<OPTION VALUE="1">citations</OPTION>
|
253
|
-
<OPTION VALUE="2">header</OPTION>
|
254
|
-
<OPTION VALUE="4">section</OPTION>
|
255
|
-
</SELECT>
|
256
|
-
</P>
|
257
|
-
<P>Citation export formats
|
258
|
-
<INPUT TYPE=CHECKBOX NAME="ads2">ADS
|
259
|
-
<INPUT TYPE=CHECKBOX NAME="bib2" CHECKED>BIB
|
260
|
-
<INPUT TYPE=CHECKBOX NAME="end2">EndNote
|
261
|
-
<INPUT TYPE=CHECKBOX NAME="isi2">ISI
|
262
|
-
<INPUT TYPE=CHECKBOX NAME="ris2">RIS
|
263
|
-
<INPUT TYPE=CHECKBOX NAME="wordbib2">WordBib
|
264
|
-
</P>
|
265
|
-
|
266
|
-
<br/><CENTER><INPUT TYPE="SUBMIT" VALUE="Parse this file!"></CENTER>
|
267
|
-
</FORM>
|
268
|
-
</DIV>
|
269
|
-
|
270
|
-
<!--
|
271
|
-
<P><B>Demo #2b: OCR a PDF file using Omnipage (less than 50 pages & smaller than 10MB).</B></P>
|
272
|
-
<DIV STYLE="background-color:D0D0FF; padding: 1em">
|
273
|
-
<FORM ENCTYPE="multipart/form-data" METHOD="post" ACTION="upload.cgi">
|
274
|
-
<P style="font-size:small;"><I>Internal key (if applicable):</I> <INPUT TYPE="password" SIZE="4" NAME="key"></P>
|
275
|
-
<P>File to OCR (PDF only): <INPUT TYPE="FILE" NAME="content"></P>
|
276
|
-
<br/><CENTER><INPUT TYPE="SUBMIT" VALUE="OCR this file!"></CENTER>
|
277
|
-
</FORM>
|
278
|
-
</DIV>
|
279
|
-
-->
|
280
|
-
|
281
|
-
<P><B>Demo #3: Parsing individual reference strings only (just <CODE>extract_citations</CODE>)</B></P>
|
282
|
-
<DIV STYLE="background-color:D0D0FF; padding: 1em">
|
283
|
-
<FORM ENCTYPE="multipart/form-data" METHOD="post" ACTION="parsCit.cgi">
|
284
|
-
<INPUT TYPE="text" SIZE="80" NAME="demo" value="3" style="display:none;">
|
285
|
-
<P style="font-size:small;"><I>Internal key (if applicable):</I> <INPUT TYPE="password" SIZE="4" NAME="key"></P>
|
286
|
-
<P>Input Method 1) Enter a URL to a file on the web in the correct format (each line should be a separate citation; e.g., <A
|
287
|
-
HREF="http://wing.comp.nus.edu.sg/~wing.nus/samples/E06-1050.cite">http://wing.comp.nus.edu.sg/~wing.nus/samples/E06-1050.cite</A> or <A
|
288
|
-
HREF="http://wing.comp.nus.edu.sg/~wing.nus/samples/W06-0102.cite">http://wing.comp.nus.edu.sg/~wing.nus/samples/W06-0102.cite</A>).
|
289
|
-
<INPUT TYPE="text" SIZE="80" NAME="urllines">
|
290
|
-
</P>
|
291
|
-
|
292
|
-
<P>Input Method 2) Upload a file (again, each line should be a separate citation)<BR/>
|
293
|
-
<INPUT TYPE="FILE" NAME="datalines">
|
294
|
-
</P>
|
295
|
-
|
296
|
-
<P>Input Method 3) Enter a list of plain text citations (again, one per line):<BR/>
|
297
|
-
<TEXTAREA ROWS="4" COLS="80" NAME="textlines">Isaac G. Councill, C. Lee Giles, Min-Yen Kan. (2008) ParsCit: An open-source CRF reference string parsing package. To appear in the proceedings of the Language Resources and Evaluation Conference (LREC 08), Marrakesh, Morrocco, May.
|
298
|
-
</TEXTAREA>
|
299
|
-
</P>
|
300
|
-
|
301
|
-
<P>Citation export formats
|
302
|
-
<INPUT TYPE=CHECKBOX NAME="ads3">ADS
|
303
|
-
<INPUT TYPE=CHECKBOX NAME="bib3" CHECKED>BIB
|
304
|
-
<INPUT TYPE=CHECKBOX NAME="end3">EndNote
|
305
|
-
<INPUT TYPE=CHECKBOX NAME="isi3">ISI
|
306
|
-
<INPUT TYPE=CHECKBOX NAME="ris3">RIS
|
307
|
-
<INPUT TYPE=CHECKBOX NAME="wordbib3">WordBib
|
308
|
-
</P>
|
309
|
-
|
310
|
-
<br/><CENTER><INPUT TYPE="SUBMIT" VALUE="Parse these lines!"></CENTER>
|
311
|
-
</FORM>
|
312
|
-
</DIV>
|
313
|
-
|
314
|
-
<!-- Publications ---------------------------------------------------------------------- -->
|
315
|
-
<A name="p"></a><H2>Publications</H2>
|
316
|
-
<P><B>Journal Papers:</B>
|
317
|
-
<UL>
|
318
|
-
<LI> Minh-Thang Luong, Thuy Dung Nguyen and Min-Yen Kan (forthcoming)
|
319
|
-
<U>Logical Structure Recovery in Scholarly Articles with Rich
|
320
|
-
Document Features</U>. Forthcoming in the International
|
321
|
-
Journal of Digital Library Systems. <BR/>
|
322
|
-
[ <A HREF="ijdls-SectLabel.pdf">pre-print .pdf</A> ]
|
323
|
-
</UL>
|
324
|
-
|
325
|
-
<P><B>International Refereed Conference Publications:</B>
|
326
|
-
<UL>
|
327
|
-
<LI> Isaac G. Councill, C. Lee Giles, Min-Yen Kan. (2008)
|
328
|
-
<U>ParsCit: An open-source CRF reference string parsing
|
329
|
-
package</U>. In Proceedings of the Language Resources and
|
330
|
-
Evaluation Conference (LREC 08), Marrakesh, Morocco, May.
|
331
|
-
<BR/> [ <A HREF="lrec08/lrec08.pdf">.pdf</A> ]
|
332
|
-
[ <A HREF="lrec08b.png">Poster (.png)</A> ]
|
333
|
-
</UL>
|
334
|
-
|
335
|
-
<P><B>Others:</B>
|
336
|
-
<UL>
|
337
|
-
<LI> Yong Kiat Ng. (2004) <U>Citation Parsing Using Maximum Entropy
|
338
|
-
and Repairs</U>. Undergraduate thesis. National University of
|
339
|
-
Singapore. <BR/>
|
340
|
-
[ <A HREF="yongKiatNgThesis.pdf">.pdf</A> ]
|
341
|
-
</UL>
|
342
|
-
|
343
|
-
<!-- Output ---------------------------------------------------------------------- -->
|
344
|
-
<A name="gsiso"></a><H2>Gold Standard Input and Sample Output</H2>
|
345
|
-
|
346
|
-
<UL>
|
347
|
-
<LI>Chunk tagged data for <A HREF="cora.tagged.txt">Cora</A>, <A
|
348
|
-
HREF="citeseerx.tagged.txt">CiteSeer<SUP>X</SUP></A>, <A
|
349
|
-
HREF="flux-cim-cs.tagged.txt">FLUX-CiM</A> and humanities (<A
|
350
|
-
HREF="it-humanities.tagged.txt">Italian</A>, <A
|
351
|
-
HREF="en-humanities.tagged.txt">English</A>, and <A
|
352
|
-
HREF="mixed-humanities.tagged.txt">mixed language</A>) datasets
|
353
|
-
(suitable for ParsCit training). For FLUX-CiM data, please try
|
354
|
-
the original hosting site maintained by Eli Cortez. Credits to
|
355
|
-
Matteo Romanello for contributing the humanities datasets.
|
356
|
-
<LI> <A HREF="iconip.tagged.txt">Chunk tagged data for some ICONIP
|
357
|
-
papers</A>. Contributed by Cheong Chi Hung.
|
358
|
-
<LI>Results of running the v080917 version of ParsCit on FLUX-CiM's
|
359
|
-
dataset for [ <A HREF="flux-cim-cs.out.xml">300 computer science
|
360
|
-
references</A> ] [ <A HREF="flux-cim-med.out.xml">2000 medical
|
361
|
-
references</A> ] [ <A HREF="cora.out.xml">on the CORA dataset</A>
|
362
|
-
]. Note that these results are considered cheating as current
|
363
|
-
version has been trained on this data.
|
364
|
-
<LI> Tagged section data for the SectLabel module. <BR/> [ <A
|
365
|
-
HREF="sectLabelXML.tagged.txt">XML Format</A> ] [ <A
|
366
|
-
HREF="sectLabel.tagged.txt">Plain Text Format</A> ]<BR/>
|
367
|
-
[ <A HREF="genericSect.tagged.txt">GenericSect training data</A> ]
|
368
|
-
</UL>
|
369
|
-
|
370
|
-
<!-- Group Members ---------------------------------------------------------------------- -->
|
371
|
-
<A name="gm"></a><H2>Group Members</H2>
|
372
|
-
|
373
|
-
<UL>
|
374
|
-
<LI> <A HREF="http://www.comp.nus.edu.sg/~kanmy">Min-Yen Kan</A> - Project leader, NUS
|
375
|
-
<LI> <A HREF="http://www.personal.psu.edu/igc2/">Isaac G. Councill</A>, The Pennsylvania State University
|
376
|
-
<LI> <A HREF="http://clgiles.ist.psu.edu/">C. Lee Giles</A>, The Pennsylvania State University
|
377
|
-
<LI> <A HREF="http://wing.comp.nus.edu.sg/~lmthang">Minh-Thang Luong</A> - Research Assistant (alumnus), NUS
|
378
|
-
<LI> Yong Kiat Ng - Final year undergraduate student (graduated, 2004), NUS
|
379
|
-
<LI> Thuy Dung Nguyen - Research Assistant (alumnus), NUS
|
380
|
-
<LI> Huy Nhat Hoang Do - Research Assistant, NUS
|
381
|
-
</UL>
|
382
|
-
|
383
|
-
<!-- FAQ ---------------------------------------------------------------------- -->
|
384
|
-
<A name="faq"></a><H2>FAQ</H2>
|
385
|
-
<DL>
|
386
|
-
<DT>What platforms does ParsCit work on?</DT>
|
387
|
-
<DD>ParsCit works on all major platforms: Windows, Linux and MacOS.
|
388
|
-
The installation requires ruby and perl and the CRF++ embedded
|
389
|
-
package also requires standard UNIX utilities like sed. You
|
390
|
-
should have a working knowledge of UNIX and some experience in
|
391
|
-
installing UNIX tools. Due to our time constraints, we may not be
|
392
|
-
able to answer your particular problems with installation. Do let us
|
393
|
-
know if there was something important that you had to do to get
|
394
|
-
your particular download and installation working; we'll
|
395
|
-
incorporate it into the Troubleshooting section below.</DD>
|
396
|
-
<DT>What is the difference between SectLabel and the previous ParsHed?</DT>
|
397
|
-
<DD>SectLabel is a newly-developed module that further extends
|
398
|
-
ParsHed in functionality. It not only classifies header metadata,
|
399
|
-
but analyzes full documents to output the logical structure of
|
400
|
-
the internals of the document -- sections, subsections, figures,
|
401
|
-
tables, equations, footnotes and captions. <BR/> For compatibility
|
402
|
-
issues, the ParsHed module is still retained in our source code
|
403
|
-
and command line options. </DD>
|
404
|
-
|
405
|
-
<DT>How do I retrain ParsCit for a different language? I saw code in
|
406
|
-
lib/ParsCit/PreProcess to find the beginning of the bibliography
|
407
|
-
section, and changed that but it doesn't work.</DT>
|
408
|
-
<DD>The current version does not depend on those regular expressions
|
409
|
-
anymore, they are for previous versions (e.g., v101101). ParsCit
|
410
|
-
now first labels each line using the SectLabel module and
|
411
|
-
discovers which lines to parse references for based on the first
|
412
|
-
step's output. You need to retrain SectLabel for this, by
|
413
|
-
providing labeled data about what class of line each line in your
|
414
|
-
training data is. It's also possible to "downgrade" the current
|
415
|
-
version to go back to use the rule-based method for identifying
|
416
|
-
the reference section.</DD>
|
417
|
-
<DT>What is the "genericHeader" in the output of SectLabel? What is
|
418
|
-
the difference between "genericSect.tagged" and "SectLabel.tagged"?</DT>
|
419
|
-
<DD>Generic headers, such as introduction, methodology, and
|
420
|
-
evaluation, represent generic purposes of different sections in a
|
421
|
-
scholarly article. We map all section names to generic ones
|
422
|
-
(i.e., "5. Text Features" to "Methodology"). This promotes
|
423
|
-
comparative viewing of sections with identical purpose across
|
424
|
-
articles. For the second question, actually, Generic section is
|
425
|
-
a component of SectLabel. It is responsible for classifying the
|
426
|
-
section headers of a paper into the generic categories such as
|
427
|
-
Introduction, Methodology, Result, etc. For details refer to our
|
428
|
-
IJDLS journal paper.
|
429
|
-
</DD>
|
430
|
-
<DT>Why is there an option to input file in XML format? Which DTD
|
431
|
-
should it follow?</DT>
|
432
|
-
<DD>SectLabel is a robust logical document structure inference
|
433
|
-
system that can handle both rich input (produced by OCR software
|
434
|
-
such as font or spatial features) to boost recognition
|
435
|
-
performance, but still be able to perform inference on
|
436
|
-
impoverished input (plain text) with degraded
|
437
|
-
performance. Currently, the XML input must be in the form of
|
438
|
-
output from Nuance OmniPage (version 16)'s XML format, and hence,
|
439
|
-
should follow the DTD by OmniPage. Note: The ParsCit team is not
|
440
|
-
affiliated with Nuance in any way nor does it endorse
|
441
|
-
OmniPage.</DD>
|
442
|
-
<DT> I need to run ParsCit but I can't get well-formed text from my
|
443
|
-
PDF documents. Can you help?</DT>
|
444
|
-
<DD> No, we cannot help you with this. We don't perform OCR or text
|
445
|
-
extraction from PDF documents. You will have to find your own
|
446
|
-
source for doing the extraction or conversion. We've found
|
447
|
-
Omnipage useful in our own project work (hence the possibility of
|
448
|
-
XML input), but we don't endorse any product.</DD>
|
449
|
-
<DT> The OmniPage XML doesn't seem to be well-formed. Is that OK?</DT>
|
450
|
-
<DD> Yes. The sample "XML" provided in the links (for Demo 2) are
|
451
|
-
actual outputs for a sequence of XML pages (one XML file per
|
452
|
-
page). If you use OmniPage to save an XML file for input to
|
453
|
-
ParsCit, make sure to save individual pages as separate files,
|
454
|
-
then concatenate them to send to ParsCit. You may want to
|
455
|
-
download the sample links for inspection (as they are
|
456
|
-
concatenations of several XML files, your browser will likely
|
457
|
-
complain about them not being well-formed).</DD>
|
458
|
-
<DT> I ran Demos 1 and 2 with the default "all" settings, but sections
|
459
|
-
don't seem to be detected.</DT>
|
460
|
-
<DD> There's no problem. The demo just hides the SectLabel output
|
461
|
-
by default. Click "Show SectLabel output" to reveal it.</DD>
|
462
|
-
<DT> I ran ParsCit using the OmniPage XML output, but encountered malformed UTF8 character errors.</DT>
|
463
|
-
<DD> OmniPage normally outputs XML results in UTF-16 format; converting them to UTF-8 will solve the problem, as shown below: <BR/>
|
464
|
-
<I> iconv --from-code UTF-16 --to-code UTF-8 omnipageOutput.xml > newOmnipageOutput.xml</I>
|
465
|
-
</DD>
|
466
|
-
</DL>
|
467
|
-
|
468
|
-
<!-- Troubleshooting ---------------------------------------------------------------------- -->
|
469
|
-
<A name="t"></a><H2>Troubleshooting</H2>
|
470
|
-
|
471
|
-
<P> A list of common problems with ParsCit. If you find problems,
|
472
|
-
email the lead developer at <kanmy@comp.nus.edu.sg>. Please use
|
473
|
-
the subject "[ParsCit]" to ensure that it reaches our attention. If
|
474
|
-
you have hand-corrected tagged data that you don't mind providing us,
|
475
|
-
we can use that to further improve ParsCit's extracting capabilities.
|
476
|
-
Nevertheless, there are problems with the output occasionally. Below
|
477
|
-
are some common problems people have encountered.
|
478
|
-
|
479
|
-
<DL>
|
480
|
-
<DT>ParsCit v110505 seems to be a lot slower when used on Omnipage
|
481
|
-
output than the previous versions, why?</DT>
|
482
|
-
<DD>You are correct. We are now using XML::Twig to do the XML
|
483
|
-
processing correctly, rather than do it ad-hoc ourselves, but this
|
484
|
-
requires constructing an exhaustive DOM tree for the Omnipage input.
|
485
|
-
This is the timesink that you are experiencing.</DD>
|
486
|
-
<DT>I'm running ParsCit on Windows but I can't get it to work, even
|
487
|
-
after installing a perl interpreter. Specifically, the
|
488
|
-
citeExtract.pl program dies complaining that it Can't open
|
489
|
-
"/tmp/...." at line 175. </DT>
|
490
|
-
<DD>ParsCit hasn't been fully tested on windows at NUS, so we can't
|
491
|
-
vouch for whether it will run correctly. In this specific error
|
492
|
-
case, the "/tmp/" directory (a standard place for temporary files in
|
493
|
-
UNIX systems) is normally not available in Windows, and may generate
|
494
|
-
problems. You may need to change the code and/or create an
|
495
|
-
appropriate directory for ParsCit to generate such files.</DD>
|
496
|
-
<DT>I tried downloading and running ParsCit but I get complaints
|
497
|
-
about /bin/sed and crf not being found. Help?</DT>
|
498
|
-
<DD>Please read the INSTALL.txt directions. You need to recompile
|
499
|
-
CRF++ for your platform. The paths included with the install are
|
500
|
-
for our version, you need to recompile to have the paths point
|
501
|
-
correctly.</DD>
|
502
|
-
<DT>When running citeExtract.pl I get some errors complaining about
|
503
|
-
the wrong ELF class of the binaries. How can I fix this?</DT>
|
504
|
-
<DD>This seems to be a problem with the compiled executables of
|
505
|
-
CRF++ bundled with the software. Follow the INSTALL instructions
|
506
|
-
but after step 1 do:
|
507
|
-
<P>
|
508
|
-
<CODE>$ cp -Rf * ../../.libs<BR/>
|
509
|
-
$ cp crf_learn ../../.libs/lt-crf_learn<BR/>
|
510
|
-
$ cp crf_test ../../.libs/lt-crf_test<BR/>
|
511
|
-
</CODE></DD>
|
512
|
-
<DT>I'm trying to install parscit v110505 using the instructions in the install file, and when I get to the point where you're supposed to recompile CRF, it exits with an error:<BR/>
|
513
|
-
|
514
|
-
<PRE>In file included from node.h:13:0,
|
515
|
-
from node.cpp:9:
|
516
|
-
path.h:26:52: error: 'size_t' has not been declared
|
517
|
-
make[1]: *** [node.lo] Error 1
|
518
|
-
make[1]: Leaving directory `/home/agarnett/parscit/crfpp/CRF++-0.51'
|
519
|
-
make: *** [all] Error 2</PRE><BR/>
|
520
|
-
The install file mentions that this may fail the first time; unfortunately for me, it keeps failing. any help?</DT>
|
521
|
-
<DD>The error is from CRF++ package (not from ParsCit), there are two ways to fix it:<BR/>
|
522
|
-
1. Add the line <CODE>#include<iostream></CODE> to node.cpp and compile CRF++ again; or<BR/>
|
523
|
-
2. Go to <A HREF="http://crfpp.googlecode.com/svn/trunk/doc/index.html">http://crfpp.googlecode.com/svn/trunk/doc/index.html</A> and download the latest version. The instruction is the same. Hope this helps.</DD>
|
524
|
-
<DT>Issue numbers don't get extracted.</DT>
|
525
|
-
<DD><SPAN CLASS="red">This issue should be fixed as of the v110505
|
526
|
-
release.</SPAN> There is now some heuristic postprocessing code to
|
527
|
-
take care of breaking single or multiple tokens for issues and
|
528
|
-
volumes. </DD>
|
529
|
-
<DT>Separation of author names and publishing year fails</DT>
|
530
|
-
<DD> In some reference data with non-standard sequences of
|
531
|
-
first names and family names, e.g.
|
532
|
-
<pre>
|
533
|
-
Baltes, Paul, Ursula Staudinger, Ulmann Lindenberger (1999): Lifespan
|
534
|
-
psychology: theory and application of intellectual functioning; in:
|
535
|
-
Annual Review of Psychology, 50, 471-507
|
536
|
-
</pre>
|
537
|
-
ParsCit's post processing step may not detect and deal with these
|
538
|
-
problems reliably. We're working to fix these too. </DD>
|
539
|
-
<DT>I passed ParsCit plain text output but in another, non-English
|
540
|
-
language. I didn't get good results or I got empty results. Can
|
541
|
-
you help? </DT>
|
542
|
-
<DD>Aside from English, ParsCit can handle Italian and German to a
|
543
|
-
limited extent, thanks to the multilingual training data.
|
544
|
-
However, the demo web interface uploads non-ASCII (e.g., UTF-8 or
|
545
|
-
UTF-16 data) as binary data and fails to execute ParsCit.
|
546
|
-
However, if you download a copy of ParsCit, the libraries do work
|
547
|
-
on such data. Here's a <A
|
548
|
-
HREF="humanities.test.out.xml">sample</A>. We'd love to help make
|
549
|
-
a more universal model that can accommodate reference strings in
|
550
|
-
other languages. If you're willing to help contribute ground
|
551
|
-
truth data, we'd love to hear from you!</DD>
|
552
|
-
|
553
|
-
<DT>How about retraining ParsCit for another language/domain?</DT>
|
554
|
-
<DD>You can put your supervised exemplar data into the same format
|
555
|
-
as tagged_references.txt found in crfpp/traindata/. Once you have
|
556
|
-
this file you can generate the appropriate model for ParsCit, by
|
557
|
-
using three commands (assumes you are in the crfpp/traindata
|
558
|
-
directory):
|
559
|
-
<P>
|
560
|
-
<CODE>$ ../../bin/tr2crfpp.pl tagged_references.txt > parsCit.train.data
|
561
|
-
<BR/>
|
562
|
-
$ ../crf_learn parsCit.template parsCit.train.data model
|
563
|
-
<BR/>
|
564
|
-
$ mv model ../../resources/parsCit.model
|
565
|
-
</CODE>
|
566
|
-
<P>The first command creates the input feature file that crfpp uses
|
567
|
-
from the training data. The second creates the model using the
|
568
|
-
crf_learn command. You can then move the model file to the
|
569
|
-
resources/ subdirectory where it can be utilized. To replace the
|
570
|
-
default model that comes with ParsCit, just execute the final
|
571
|
-
command. </DD>
|
572
|
-
<DT> Can I retrain the package for a different set of tags if I
|
573
|
-
change the tagset in the training data?</DT>
|
574
|
-
<DD> Yes, you should be able to change the tagset to suit your
|
575
|
-
dataset. You can add, eliminate and change the tagset as you
|
576
|
-
wish. You need to retrain the parser system after creating your
|
577
|
-
tag data. For more details on the training process, see the
|
578
|
-
documentation for CRF++, that is on the web at sourceforge.
|
579
|
-
</DD>
|
580
|
-
<DT>When retraining I get a "bad_alloc" error. What gives?</DT>
|
581
|
-
<DD>We're not entirely sure of this. CRF training is quite memory
|
582
|
-
intensive and running a large amount of training data tuples may
|
583
|
-
cause the embedded CRF++ package to fail. You can try with less
|
584
|
-
training data, or try training on a machine with a larger amount
|
585
|
-
of RAM. </DD>
|
586
|
-
<DT>Does the web service actually work? I can't seem to run it.</DT>
|
587
|
-
<DD>Occasionally our school's networking staff changes the firewall
|
588
|
-
settings, so the port for our group's web services may be blocked
|
589
|
-
(port 4000 on host wing.comp.nus.edu.sg). If you find you can't
|
590
|
-
reach our services (they time out), please let us know. </DD>
|
591
|
-
<DT>I get funny errors with crf_test not being useful. How do I
|
592
|
-
fix this?</DT> <DD>The updated README.txt file in the 090625b
|
593
|
-
distribution fixes this. Basically you need to recompile CRF++
|
594
|
-
0.51 and place the libraries and the executables in the proper
|
595
|
-
place. See the README for details.</DD>
|
596
|
-
|
597
|
-
</DL>
|
598
|
-
|
599
|
-
<!-- Kudos ---------------------------------------------------------------------- -->
|
600
|
-
<H2>Kudos</H2>
|
601
|
-
|
602
|
-
<p>ParsCit owes its continued maintenance and support to its user
|
603
|
-
base. Here we'd like to thank them for their help.</p>
|
604
|
-
|
605
|
-
<P>Thanks to David Judd who reconfigured how CRF++ is located with
|
606
|
-
respect to the main code. Thanks to Alex Garnett for spotting more
|
607
|
-
problems with CRF dependencies. Thanks to George E. Raptis and Eric
|
608
|
-
Tran for the port to Windows. Thanks to Zhu Ying-Bo
|
609
|
-
(yumichika@163.com) from the Language Computing and Web Mining Group,
|
610
|
-
Institute of Computer Science and Technology of Peking University for
|
611
|
-
the partial port to Windows. Thanks to Yustus Oktian for questions
|
612
|
-
about training for another language. Thanks to Madhur Kapoor for
|
613
|
-
asking questions about PDF conversion. Thanks to Behrang Qasemizadeh
|
614
|
-
for reporting problems with truncation of XML entities in XML output
|
615
|
-
(v110505). Thanks to Tim Brody for his BiblioScript patch. Thanks to
|
616
|
-
David Jurgens for suggesting that we remove temporary files after runs
|
617
|
-
(v110505). Thanks to Nikolay Nikolov for suggesting the conversion of
|
618
|
-
OmniPage XML results from UTF-16 to UTF-8 to avoid encoding
|
619
|
-
problems. Thanks to Matteo Romanello for the suggestion and permission
|
620
|
-
to incorporate BiblioScript software (v101101). Many thanks to Kris
|
621
|
-
Jack for pointing out problems with the ELF binaries and an
|
622
|
-
appropriate fix. Thanks to Cheong Chi Hong for fixing problems with
|
623
|
-
Preprocess.pm (v100401) and contributing the ICONIP training data and
|
624
|
-
XML entity problems in reference string parsing (v100401). Thanks to
|
625
|
-
Priya Venkateshan for pointing out sudo/root installation
|
626
|
-
possibilities (v100401). Thanks to Mario Lipinski for reporting
|
627
|
-
punctuation stripping problems in reference string parsing (v100401).
|
628
|
-
Thanks to Artemy Kolchinsky for fixes in Preprocess.pm
|
629
|
-
(v090625). Thanks to Matteo Romanello for the humanities training
|
630
|
-
datasets. Thanks to Dain Kaplan for helping us fix the Preprocess.pm
|
631
|
-
bug. Thanks to Ayeh Bandeh-Ahmadi for correcting the warning in
|
632
|
-
parseRefString.pl. Thanks to Nick Friedrich and Jöran Beel of
|
633
|
-
scienstein.org for all fixes in the v081201 version of ParsCit. Also
|
634
|
-
thanks to Madian Khabsa for indicating problems with installation to
|
635
|
-
MacOS.</p>
|
636
|
-
|
637
|
-
<P>ParsCit is used by many projects worldwide, and not just in
|
638
|
-
experimental, research and academic places, but in commercial
|
639
|
-
enterprises as well. <A HREF="http://www.mendeley.com/">Mendeley</A>
|
640
|
-
is using ParsCit to parse references from contributed papers, as is
|
641
|
-
the <A HREF="http://citec.repec.org/">Citations in Economics
|
642
|
-
(CitEc)</A> project.
|
643
|
-
|
644
|
-
|
645
|
-
<!-- Related Links ---------------------------------------------------------------------- -->
|
646
|
-
<H2>Related Links</H2>
|
647
|
-
|
648
|
-
<P>Other, open-source citation parsers:
|
649
|
-
|
650
|
-
<UL>
|
651
|
-
<LI> <A
|
652
|
-
HREF="http://freecite.library.brown.edu/welcome">FreeCite</A>:
|
653
|
-
supported by the Mellon Foundation and Brown University. Written in
|
654
|
-
Ruby on Rails, with the same CRF++ backend.
|
655
|
-
<LI> A <A
|
656
|
-
HREF="http://purl.net/net/egh/hmm-citation-extractor/">Hidden Markov
|
657
|
-
Model Citation Extractor</A>: written by Erik Hetzner of the
|
658
|
-
California Digital Library.
|
659
|
-
</UL>
|
660
|
-
|
661
|
-
<P> Other related links. Contact Min below to get your other related
|
662
|
-
software listed here. Thanks!
|
663
|
-
|
664
|
-
<UL>
|
665
|
-
<LI> Perhaps you're interested in open source code for libraries?
|
666
|
-
If so try the <A
|
667
|
-
HREF="http://dewey.library.nd.edu/mailing-lists/code4lib/">CODE4LIB
|
668
|
-
mailing list</A>.
|
669
|
-
|
670
|
-
<LI> <A
|
671
|
-
HREF="https://wiki.birncommunity.org:8443/display/NEWBIRNCC/LATISI+-+Literature+Annotation+Tool+from+the+Information+Sciences+Institute">LATISI
|
672
|
-
- Literature Annotation Tool from the Information Sciences
|
673
|
-
Institute</A>. A related project from ISI, using MBL instead of CRF.
|
674
|
-
<LI> <A HREF="http://www.scienstein.org">Scienstein.org</A>: A
|
675
|
-
recommendation system for papers.
|
676
|
-
<LI> PdfBox: An open-source package for extracting text information
|
677
|
-
from PDF files. Does not deal with problems with custom font
|
678
|
-
encodings.
|
679
|
-
</UL>
|
680
|
-
|
681
|
-
<HR>
|
682
|
-
<H5><ADDRESS><A HREF="http://www.comp.nus.edu.sg/~kanmy">Min-Yen Kan</A> <<A HREF="mailto:kanmy@comp.nus.edu.sg">kanmy@comp.nus.edu.sg</A>></ADDRESS>
|
683
|
-
Created on: Fri Dec 24 01:48:05 SGT 2004
|
684
|
-
<!-- hhmts start -->
|
685
|
-
| Version: 1.0
|
686
|
-
|
687
|
-
| Last modified:
|
688
|
-
Mon Mar 4 14:23:46 SGT 2013
|
689
|
-
<!-- hhmts end -->
|
690
|
-
</H5>
|
691
|
-
</div>
|
692
|
-
</BODY> </HTML>
|