biblicit 2.2.1 → 2.2.2

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -13,7 +13,7 @@ Note: The version is 2.x, but really should be 0.2.x.
13
13
  result = Biblicit::Extractor.extract(content: "a string containing the content of a PDF file")
14
14
 
15
15
  # Extract metadata from a file using all available tools
16
- result = Biblicit::Extractor.extract(file: "myfile.pdf", tools: [:citeseer, :parshed, :cb2bib], remote: true, token: false)
16
+ result = Biblicit::Extractor.extract(file: "myfile.pdf", tools: [:parshed, :cb2bib], remote: true, token: false)
17
17
 
18
18
  # See reference information for "myfile.pdf"
19
19
  result[:citeseer][:title]
@@ -139,7 +139,7 @@ You can specify where you have installed CRF++ by setting the CRFPP_HOME environ
139
139
 
140
140
  sudo apt-add-repository 'deb http://cl.naist.jp/~eric-n/ubuntu-nlp oneiric all'
141
141
  sudo apt-get update
142
- sudo apt-get install libcrf++ crf++
142
+ sudo apt-get install libcrf++-dev crf++
143
143
 
144
144
  ##### On OS X with Homebrew
145
145
 
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Biblicit
4
4
 
5
- VERSION = '2.2.1'
5
+ VERSION = '2.2.2'
6
6
 
7
7
  end
@@ -53,8 +53,8 @@ $tmpfile .= $$ . time;
53
53
 
54
54
  # Untaint tmpfile variable
55
55
  if ($tmpfile =~ /^([-\@\w.]+)$/) { $tmpfile = $1; }
56
-
57
- $tmpfile = "/tmp/" . $tmpfile;
56
+ my $tmpdir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
57
+ $tmpfile = "$tmpdir/$tmpfile";
58
58
  $0 =~ /([^\/]+)$/;
59
59
  my $progname = $1;
60
60
 
@@ -379,7 +379,8 @@ sub BiblioScript
379
379
  my ($types, $pc_xml, $outfile) = @_;
380
380
 
381
381
  my @export_types = @{ $types };
382
- my $tmp_dir = "/tmp/" . NewTmpFile();
382
+ my $base_tmp_dir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
383
+ my $tmp_dir = $base_tmp_dir . "/" . NewTmpFile();
383
384
  system("mkdir -p $tmp_dir");
384
385
 
385
386
  # Write extract_citation output to a tmp file
@@ -10,7 +10,8 @@ use FindBin;
10
10
  my $tmpfile .= $0; $tmpfile =~ s/[\.\/]//g;
11
11
  $tmpfile .= $$ . time;
12
12
  if ($tmpfile =~ /^([-\@\w.]+)$/) { $tmpfile = $1; } # untaint tmpfile variable
13
- $tmpfile = "/tmp/" . $tmpfile;
13
+ my $tmpdir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
14
+ $tmpfile = "$tmpdir/$tmpfile";
14
15
  $0 =~ /([^\/]+)$/; my $progname = $1;
15
16
  my $outputVersion = "1.0";
16
17
 
@@ -29,7 +29,8 @@ use strict 'vars';
29
29
  my $tmpfile .= $0; $tmpfile =~ s/[\.\/]//g;
30
30
  $tmpfile .= $$ . time;
31
31
  if ($tmpfile =~ /^([-\@\w.]+)$/) { $tmpfile = $1; } # untaint tmpfile variable
32
- $tmpfile = "/tmp/" . $tmpfile;
32
+ my $tmpdir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
33
+ $tmpfile = "$tmpdir/$tmpfile";
33
34
  $0 =~ /([^\/]+)$/; my $progname = $1;
34
35
  my $outputVersion = "1.0";
35
36
  ### END user customizable section
@@ -5,7 +5,8 @@
5
5
  my $tmpfile .= $0; $tmpfile =~ s/[\.\/]//g;
6
6
  $tmpfile .= $$ . time;
7
7
  if ($tmpfile =~ /^([-\@\w.]+)$/) { $tmpfile = $1; } # untaint tmpfile variable
8
- $tmpfile = "/tmp/" . $tmpfile;
8
+ my $tmpdir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
9
+ $tmpfile = "$tmpdir/$tmpfile";
9
10
  $0 =~ /([^\/]+)$/; my $progname = $1;
10
11
  my $outputVersion = "1.0";
11
12
  my $parscitHome = "/home/wing.nus/services/parscit/tools/";
@@ -8,7 +8,7 @@ pwd = File.dirname(__FILE__)
8
8
  @CRFPP = ENV['CRFPP_HOME'] ? "#{ENV['CRFPP_HOME']}/bin" : "#{pwd}/../../crfpp"
9
9
  @SRC = "#{pwd}/genericSect"
10
10
  @DATA = "#{pwd}/../../resources/sectLabel/"
11
- @TEST_DIR = "/tmp/"
11
+ @TEST_DIR = ENV['PARSCIT_TMPDIR'] || "/tmp"
12
12
 
13
13
  require "#{@SRC}/forceUtf8"
14
14
 
@@ -29,7 +29,8 @@ use FindBin;
29
29
  my $tmpfile .= $0; $tmpfile =~ s/[\.\/]//g;
30
30
  $tmpfile .= $$ . time;
31
31
  if ($tmpfile =~ /^([-\@\w.]+)$/) { $tmpfile = $1; } # untaint tmpfile variable
32
- $tmpfile = "/tmp/" . $tmpfile;
32
+ my $tmpdir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
33
+ $tmpfile = "$tmpdir/$tmpfile";
33
34
  $0 =~ /([^\/]+)$/; my $progname = $1;
34
35
  my $outputVersion = "1.0";
35
36
  my $dictFile = "$FindBin::Bin/../resources/parsCitDict.txt";
@@ -33,7 +33,7 @@ $Resource_Dir = "$parscitHome/resources/headerParse";
33
33
  $Database_Dir = "$Resource_Dir/database/";
34
34
  $Data_Dir = "$Resource_Dir/data/";
35
35
  $offlineD = "$Resource_Dir/models/";
36
- $Tmp_Dir = "$parscitHome/tmp";
36
+ $Tmp_Dir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
37
37
 
38
38
  $nMinHeaderLength = 50;
39
39
  $nMaxHeaderLength = 2500;
@@ -982,8 +982,8 @@ sub BuildTmpFile
982
982
  ###
983
983
  # Altered by Min (Thu Feb 28 13:08:59 SGT 2008)
984
984
  ###
985
- return "/tmp/$tmpfile";
986
- # return $tmpfile;
985
+ my $tmpdir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
986
+ return "$tmpdir/$tmpfile";
987
987
  }
988
988
 
989
989
  sub Fatal
@@ -329,9 +329,9 @@ sub buildTmpFile {
329
329
  if ($tmpfile =~ /^([-\@\w.]+)$/) {
330
330
  $tmpfile = $1;
331
331
  }
332
-
333
- # return $tmpfile;
334
- return "/tmp/$tmpfile"; # Altered by Min (Thu Feb 28 13:08:59 SGT 2008)
332
+
333
+ my $tmpdir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
334
+ return "$tmpdir/$tmpfile"; # Altered by Min (Thu Feb 28 13:08:59 SGT 2008)
335
335
 
336
336
  } # buildTmpFile
337
337
 
@@ -216,8 +216,8 @@ sub buildTmpFile {
216
216
  $tmpfile = $1;
217
217
  }
218
218
 
219
- # return $tmpfile;
220
- return "/tmp/$tmpfile"; # Altered by Min (Thu Feb 28 13:08:59 SGT 2008)
219
+ my $tmpdir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
220
+ return "$tmpdir/$tmpfile"; # Altered by Min (Thu Feb 28 13:08:59 SGT 2008)
221
221
 
222
222
  } # buildTmpFile
223
223
 
@@ -1930,7 +1930,8 @@ sub BuildTmpFile
1930
1930
  $tmpfile = $1;
1931
1931
  }
1932
1932
 
1933
- return "/tmp/$tmpfile"; # Altered by Min (Thu Feb 28 13:08:59 SGT 2008)
1933
+ my $tmpdir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
1934
+ return "$tmpdir/$tmpfile"; # Altered by Min (Thu Feb 28 13:08:59 SGT 2008)
1934
1935
  }
1935
1936
 
1936
1937
  1;
@@ -212,7 +212,7 @@ sub GetGenericHeaders
212
212
  my $num_headers = scalar(@{ $headers });
213
213
 
214
214
  # Put the list of headers to file
215
- my $header_file = "/tmp/" . NewTmpFile();
215
+ my $header_file = NewTmpFile();
216
216
 
217
217
  $generic_sect_path = UntaintPath($generic_sect_path);
218
218
 
@@ -328,7 +328,9 @@ sub NewTmpFile
328
328
 
329
329
  chomp($tmpfile);
330
330
  $tmpfile = UntaintPath($tmpfile);
331
- return $tmpfile;
331
+
332
+ my $tmpdir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
333
+ return "$tmpdir/$tmpfile";
332
334
  }
333
335
 
334
336
  1;
@@ -1079,8 +1079,9 @@ sub BuildTmpFile
1079
1079
  {
1080
1080
  $tmpfile = $1;
1081
1081
  }
1082
-
1083
- return "/tmp/$tmpfile"; # Altered by Min (Thu Feb 28 13:08:59 SGT 2008)
1082
+
1083
+ my $tmpdir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
1084
+ return "$tmpdir/$tmpfile"; # Altered by Min (Thu Feb 28 13:08:59 SGT 2008)
1084
1085
  }
1085
1086
 
1086
1087
 
metadata CHANGED
@@ -2,14 +2,14 @@
2
2
  name: biblicit
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 2.2.1
5
+ version: 2.2.2
6
6
  platform: ruby
7
7
  authors:
8
8
  - David Judd
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-05-08 00:00:00.000000000 Z
12
+ date: 2013-05-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  prerelease: false
@@ -153,7 +153,6 @@ files:
153
153
  - parscit/bin/sectLabel/single2multi.pl
154
154
  - parscit/bin/sectLabel/tr2crfpp.pl
155
155
  - parscit/bin/tr2crfpp.pl
156
- - parscit/doc/index.html
157
156
  - parscit/lib/CSXUtil/SafeText.pm
158
157
  - parscit/lib/HeaderParse/API/AssembleXMLMetadata.pm
159
158
  - parscit/lib/HeaderParse/API/Function.pm
@@ -435,7 +434,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
435
434
  - !ruby/object:Gem::Version
436
435
  segments:
437
436
  - 0
438
- hash: 782848681955337634
437
+ hash: -1109935214577805230
439
438
  version: '0'
440
439
  none: false
441
440
  required_rubygems_version: !ruby/object:Gem::Requirement
@@ -444,7 +443,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
444
443
  - !ruby/object:Gem::Version
445
444
  segments:
446
445
  - 0
447
- hash: 782848681955337634
446
+ hash: -1109935214577805230
448
447
  version: '0'
449
448
  none: false
450
449
  requirements:
@@ -1,692 +0,0 @@
1
- <!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 3.0//EN">
2
-
3
- <HTML><HEAD>
4
- <META HTTP-EQUIV="Keywords" NAME="Keywords" CONTENT="citations, citation parser, references, reference parser, bibliography parser, reference string parser, bibtex, citation, citation extraction, logical structure, document logical structure">
5
- <LINK REL="stylesheet" HREF="parsCit.css"><TITLE>ParsCit: An open-source CRF Reference String and Logical Document Structure Parsing Package</TITLE>
6
- <script type="text/javascript">
7
- function toggleLayer( whichLayer ) {
8
- var elem, vis;
9
- if( document.getElementById ) // this is the way the standards work
10
- elem = document.getElementById( whichLayer );
11
- else if( document.all ) // this is the way old msie versions work
12
- elem = document.all[whichLayer];
13
- else if( document.layers ) // this is the way nn4 works
14
- elem = document.layers[whichLayer];
15
- vis = elem.style;
16
- // if the style.display value is blank we try to figure it out here
17
- if(vis.display==''&&elem.offsetWidth!=undefined&&elem.offsetHeight!=undefined)
18
- vis.display = (elem.offsetWidth!=0&&elem.offsetHeight!=0)?'block':'none';
19
- vis.display = (vis.display==''||vis.display=='block')?'none':'block';
20
- }
21
- </script>
22
- </HEAD><BODY BGCOLOR="#FFFFFF">
23
-
24
- <div id="leftcontent">
25
- [&nbsp;<A href="http://wing.comp.nus.edu.sg/">WING homepage</A>&nbsp;]<br/>
26
- [&nbsp;<A href="http://wing.comp.nus.edu.sg/portal/web-services.html">WING web services</A>&nbsp;]<br/>
27
- <BR/>
28
- [&nbsp;<A href="#d">Download</a>&nbsp;]<br/>
29
- [&nbsp;<A href="#ws">Web Service</a>&nbsp;]<br/>
30
- [&nbsp;<A href="#wd">Web Demo</a>&nbsp;]<br/>
31
- [&nbsp;<A href="#p">Publications</a>&nbsp;]<br/>
32
- [&nbsp;<A href="#gsiso">Input and Output</a>&nbsp;]<br/>
33
- [&nbsp;<A href="#gm">Group Members</a>&nbsp;]<br/>
34
- [&nbsp;<A href="#faq">FAQ</a>&nbsp;]<br/>
35
- [&nbsp;<A href="#t">Troubleshooting</a>&nbsp;]<br/>
36
- <br/>
37
- <script type="text/javascript"
38
- src="http://feedjit.com/serve/?vv=932&tft=3&dd=0&wid=0804fba767d140cd&pid=0&proid=0&bc=FFFFFF&tc=000000&brd1=012B6B&lnk=135D9E&hc=FFFFFF&hfc=2853A8&btn=C99700&ww=200&wne=10&wh=WING+Live+Traffic+Feed&hl=0&hlnks=0&hfce=0&srefs=1&hbars=0"></script><noscript><a
39
- href="http://feedjit.com/">Feedjit Live Blog Stats</a></noscript>
40
- </div>
41
-
42
- <div id="centercontent">
43
- <IMG ALIGN="LEFT" SRC="parsCit.png" WIDTH="200px" ALT="Picture of ParsCit Swami">
44
-
45
- <CENTER><H1>ParsCit: An open-source CRF Reference String and Logical Document Structure Parsing Package</H1></CENTER>
46
-
47
- <P>This is the home page of the ParsCit project, which performs two
48
- tasks: 1) reference string parsing, sometimes also called citation
49
- parsing or citation extraction, and 2) logical structure parsing of
50
- scientific documents. It is architected as a supervised machine
51
- learning procedure that uses Conditional Random Fields as its learning
52
- mechanism. You can download the code below, parse strings online, or
53
- send batch jobs to our web service. The code contains both the
54
- training data, feature generator and shell scripts to connect the
55
- system to a web service (used on this web site).</P>
56
-
57
- <P>Some definitions (thanks to Robert Dale for Citations and Reference
58
- Strings):
59
-
60
- <DL>
61
-
62
- <DT>Reference String:</DT><DD>A text string in the bibliography or
63
- reference section of a work, usually at the end of the document that
64
- refers to a unique document. Usually occurs with other reference
65
- strings that point to other documents. May also appear as
66
- footnotes.</DD>
67
-
68
- <DT>Citation:</DT><DD>A text string (usually explicit) in the
69
- document body that points to a corresponding reference string at the
70
- end of the document. Several citations may co-refer to a single
71
- reference string.</DD>
72
-
73
- <DT>Document Logical Structure:</DT><DD> A hierarchy of logical
74
- components, for example, titles, authors, affiliations, abstracts,
75
- sections, etc., according to (Mao, Rosenfeld &amp;
76
- Kanungo,2003). Our logical structure is more comprehensive,
77
- comprising not only header metadata and references, but also the
78
- logical structure of the internals of the document -- sections,
79
- subsections, figures, tables, equations, footnotes and
80
- captions. </DD>
81
-
82
- </DL>
83
-
84
- <P>This project deals with the problem of parsing the reference
85
- strings and parsing the logical structure of a document. The first
86
- task is handled by a module with the project namesake, ParsCit, and
87
- the second task by a separate module SectLabel.
88
- </P>
89
-
90
- <br clear="all"/>
91
- <!-- License ---------------------------------------------------------------------- -->
92
- <A NAME="l"></A><H2>License</H2>
93
-
94
- <P>This software is licensed under the <A
95
- HREF="http://www.gnu.org/copyleft/lesser.html">Lesser GNU Public
96
- License</A> (LGPL), which means you are free to use it for any
97
- purpose, including embedding in commercial products. </P>
98
-
99
- <br clear="all" />
100
- <!-- Download ---------------------------------------------------------------------- -->
101
- <A NAME="d"></A><H2>Download</H2>
102
-
103
- <P>You can download the open-source code for ParsCit here. The source requires you to re-compile the CRFPP source code
104
- and assumes that perl is installed on your system and can be invoked
105
- using <CODE>perl</CODE> (must be in your path).
106
- </P>
107
-
108
- <ul>
109
-
110
- <li> Current version <A HREF="parscit-110505b.zip">110505b</A>: Added XML::Twig for XML processing. ParsCit now uses input provided by SectLabel. See <A HREF="CHANGELOG.txt"> CHANGELOG.txt </A>.<BR/>
111
- The (partially ported) <A HREF="parscit-110505b-win.zip">Windows</A> version is here (provided by Yumichika). See the <A HREF="CHANGES%20FOR%20WINDOWS.txt">CHANGES FOR WINDOWS.txt</A>
112
- <BR/>
113
- <BR/>
114
- We have also pushed a copy of the ParsCit current distribution into <A HREF="http://www.github.com/knmnyn/parscit">GitHub:knmnyn/parscit</A>.
115
- The Windows version has also been pushed to <A HREF="http://www.github.com/wing-nus/parscit">GitHub:wing-nus/parscit</A>.
116
-
117
- While we'll strive to keep the GitHub version as updated as possible, the versions on this page will remain the most authoritative for major updates.
118
- <BR/>
119
- <li> Other versions: <BR/>
120
- <A HREF="parscit-101101.zip">101101</A>: Incorporated <A HREF="http://github.com/mromanello/BiblioScript">BiblioScript</A> and <A HREF=http://www.scripps.edu/~cdputnam/software/bibutils>BibUtils</A> software. See CHANGELOG.txt; <BR/>
121
- <A HREF="parscit-100401.zip">100401d</A>: Added SectLabel (logical structure parsing) software from the NUS team, and Iconip training data from Cheong Chi Hong for ParsCit with new ParsCit model retrained. See CHANGELOG.txt; <BR/>
122
- <A HREF="parscit-090625.zip">090625b</A>: Added documentation for complete re-installation. Improved ParsHed with added line-level CRF model together and post-processing module by NUS team, WSDL file and client for service at NUS and minor bug fixes for ParsCit. See CHANGELOG.txt; <BR/>
123
- <A HREF="parscit-090316.zip">090316</A>: Incorporation of ParsHed (header parsing) software from the NUS team. See CHANGELOG.txt; <BR/>
124
- <A HREF="parscit-081201.zip">081201</A>: Bug fixes and incorporation of byte position offset from the Scienstein.org team. See CHANGELOG.txt; <BR/>
125
- <A HREF="parscit-080917.zip">080917</A>: Minor changes (improved models and multilingual support), see CHANGELOG.txt; <BR/>
126
- <A HREF="parscit-080402.zip">080402</A>: First public release. Comes with precompiled linux binaries for CRF++; <BR/>
127
- <A HREF="parscit-080310.tgz">080310</A>: Beta release.
128
-
129
- <li><A HREF="http://crfpp.sourceforge.net">CRF++</A>: A conditional random fields toolkit that you may need to install, if the compiled one does not work for you. We recommend that you use version 0.51. </ul>
130
-
131
- <!-- Web Service ---------------------------------------------------------------------- -->
132
- <A name="ws"></a><H2>Web Service</h2>
133
-
134
- <P>More NLP services are now being made available on the web.
135
- Following this trend you can send your plain text citations to use via
136
- our web service. We will parse these for you free of charge (as and
137
- when time and processing power allows, these processes are done with
138
- lower priority).</P>
139
-
140
- <P CLASS="red">N.B. We keep logs of what's parsed in these demos, to
141
- improve the accuracy and productivity of ParsCit. If you'd like these
142
- to be kept private or you find you use this service a lot, why not
143
- install a local copy of ParsCit for yourself? If you do, please
144
- let us know where you are so we acknowledge you here and can re-direct
145
- some traffic your way.
146
- </P>
147
-
148
- <UL>
149
- <LI> <A HREF="wing.nus.wsdl">Download the WSDL file</A> for the service at NUS.
150
- <LI> <A HREF="ParsCitClientWSDL.rb">Download the sample ruby client
151
- that uses the WSDL file</A> to dynamically generate the ParsCit web
152
- service call to the NUS server. Edit the file to see how to
153
- execute it.
154
- <LI> <A HREF="ParsCitClient.rb">Download sample ruby client code</a>
155
- for the ParsCit web service at the NUS server. To execute,
156
- just point it at a local
157
- text file that represents the text dump of a scholarly article
158
- (such as one produced by a PDF to text converter):
159
- <CODE>
160
- ./ParsCitClient.rb ~/public_html/samples/E06-1050.txt
161
- </CODE>
162
- <LI><FORM METHOD="post" ACTION="parsCit.cgi"><INPUT TYPE="HIDDEN"
163
- NAME="ping" VALUE="ping"><INPUT TYPE="SUBMIT" VALUE="Check"> whether
164
- the web service is up.
165
- </FORM>
166
- </UL>
167
-
168
- <!-- Web demo ----------------------------------------------------------------------- -->
169
- <A name="wd"></a><H2>Web-based Demonstration</H2>
170
-
171
- <P CLASS="red">N.B.: We keep logs of what's parsed in these demos, to
172
- improve the accuracy and productivity of ParsCit. If you'd like these
173
- to be kept private, why not install a local copy of ParsCit for
174
- yourself?</P>
175
-
176
- <P>You can also run ParsCit directly in your browser. The form below
177
- submits your text input (after suitable cleaning) to the ParsCit
178
- service to parse the input file or strings. <FONT COLOR="red">
179
- Note that if system load gets high, your demo call may not be executed. If you want to run this program in batch, please download your own copy.</FONT>
180
- </P>
181
-
182
- <P><B>Demo #1: Parsing the header, logical structure and/or reference strings (and citation contexts) from a text file</B></P>
183
-
184
- <DIV STYLE="background-color:D0D0FF; padding: 1em">
185
- <FORM ENCTYPE="multipart/form-data" METHOD="post" ACTION="parsCit.cgi">
186
- <P>NB - this demo does not handle PDF input at this time. You can use another web service or software to convert PDFs to text. </P>
187
- <P style="font-size:small;"><I>Internal key (if applicable):</I> <INPUT TYPE="password" SIZE="4" NAME="key"></P>
188
- <INPUT TYPE="text" SIZE="80" NAME="demo" value="1" style="display:none;">
189
- <P>Input Method 1) Enter a URL to a file on the web (e.g., <A HREF="http://wing.comp.nus.edu.sg/~wing.nus/samples/E06-1050.txt">http://wing.comp.nus.edu.sg/~wing.nus/samples/E06-1050.txt</A> or <A HREF="http://wing.comp.nus.edu.sg/~wing.nus/samples/W06-0102.txt">http://wing.comp.nus.edu.sg/~wing.nus/samples/W06-0102.txt</A>).<BR/>
190
- <INPUT TYPE="text" SIZE="80" NAME="urlfile">
191
- </P>
192
-
193
- <P>Input Method 2) Upload a .txt file (ASCII; UTF-8)<BR/>
194
- <INPUT TYPE="FILE" NAME="datafile">
195
- </P>
196
-
197
- <P>Input Method 3) Paste the whole file here:
198
- <br/>
199
- <TEXTAREA ROWS="4" COLS="80" NAME="textfile">
200
- </TEXTAREA>
201
- </P>
202
- <P>Parse the document using the following options
203
- <SELECT NAME="ParsCitOptions">
204
- <OPTION SELECTED VALUE="5">all</OPTION>
205
- <OPTION VALUE="1">citations</OPTION>
206
- <OPTION VALUE="2">header</OPTION>
207
- <OPTION VALUE="4">section</OPTION>
208
- </SELECT>
209
- </P>
210
-
211
- <P>Citation export formats
212
- <INPUT TYPE=CHECKBOX NAME="ads1">ADS
213
- <INPUT TYPE=CHECKBOX NAME="bib1" CHECKED>BIB
214
- <INPUT TYPE=CHECKBOX NAME="end1">EndNote
215
- <INPUT TYPE=CHECKBOX NAME="isi1">ISI
216
- <INPUT TYPE=CHECKBOX NAME="ris1">RIS
217
- <INPUT TYPE=CHECKBOX NAME="wordbib1">WordBib
218
- </P>
219
-
220
-
221
- <br/><CENTER><INPUT TYPE="SUBMIT" VALUE="Parse this file!"></CENTER>
222
- </FORM>
223
- </DIV>
224
-
225
- <P><B>Demo #2: As above but using XML input (XML must conform to Omnipage output). This demo is slow so please be patient.</B></P>
226
- <DIV STYLE="background-color:D0D0FF; padding: 1em">
227
- <FORM ENCTYPE="multipart/form-data" METHOD="post" ACTION="parsCit.cgi">
228
- <INPUT TYPE="text" SIZE="80" NAME="demo" value="2" style="display:none;">
229
- <P style="font-size:small;"><I>Internal key (if applicable):</I> <INPUT TYPE="password" SIZE="4" NAME="key"></P>
230
- <P>Input Method 1) Enter a URL to a file on the web (e.g., <A HREF="http://wing.comp.nus.edu.sg/~wing.nus/samples/E06-1050.xml">http://wing.comp.nus.edu.sg/~wing.nus/samples/E06-1050.xml</A> or <A HREF="http://wing.comp.nus.edu.sg/~wing.nus/samples/W06-0102.xml">http://wing.comp.nus.edu.sg/~wing.nus/samples/W06-0102.xml</A>).<BR/>
231
- <INPUT TYPE="text" SIZE="80" NAME="urlfile">
232
- </P>
233
-
234
- <P>Input Method 2) Upload a .xml file (ASCII; UTF-8)<BR/>
235
- <INPUT TYPE="FILE" NAME="datafile">
236
- </P>
237
-
238
- <P>Input Method 3) Paste the whole .xml file here:
239
- <br/>
240
- <TEXTAREA ROWS="4" COLS="80" NAME="textfile">
241
- </TEXTAREA>
242
- </P>
243
-
244
- <P>Input Method 4) Upload your own .pdf file (less than 50 pages & smaller than 10MB):
245
- <br/>
246
- <INPUT TYPE="FILE" NAME="pdffile">
247
- </P>
248
-
249
- <P>Parse the document using the following options
250
- <SELECT NAME="ParsCitOptions">
251
- <OPTION SELECTED VALUE="5">all</OPTION>
252
- <OPTION VALUE="1">citations</OPTION>
253
- <OPTION VALUE="2">header</OPTION>
254
- <OPTION VALUE="4">section</OPTION>
255
- </SELECT>
256
- </P>
257
- <P>Citation export formats
258
- <INPUT TYPE=CHECKBOX NAME="ads2">ADS
259
- <INPUT TYPE=CHECKBOX NAME="bib2" CHECKED>BIB
260
- <INPUT TYPE=CHECKBOX NAME="end2">EndNote
261
- <INPUT TYPE=CHECKBOX NAME="isi2">ISI
262
- <INPUT TYPE=CHECKBOX NAME="ris2">RIS
263
- <INPUT TYPE=CHECKBOX NAME="wordbib2">WordBib
264
- </P>
265
-
266
- <br/><CENTER><INPUT TYPE="SUBMIT" VALUE="Parse this file!"></CENTER>
267
- </FORM>
268
- </DIV>
269
-
270
- <!--
271
- <P><B>Demo #2b: OCR a PDF file using Omnipage (less than 50 pages & smaller than 10MB).</B></P>
272
- <DIV STYLE="background-color:D0D0FF; padding: 1em">
273
- <FORM ENCTYPE="multipart/form-data" METHOD="post" ACTION="upload.cgi">
274
- <P style="font-size:small;"><I>Internal key (if applicable):</I> <INPUT TYPE="password" SIZE="4" NAME="key"></P>
275
- <P>File to OCR (PDF only): <INPUT TYPE="FILE" NAME="content"></P>
276
- <br/><CENTER><INPUT TYPE="SUBMIT" VALUE="OCR this file!"></CENTER>
277
- </FORM>
278
- </DIV>
279
- -->
280
-
281
- <P><B>Demo #3: Parsing individual reference strings only (just <CODE>extract_citations</CODE>)</B></P>
282
- <DIV STYLE="background-color:D0D0FF; padding: 1em">
283
- <FORM ENCTYPE="multipart/form-data" METHOD="post" ACTION="parsCit.cgi">
284
- <INPUT TYPE="text" SIZE="80" NAME="demo" value="3" style="display:none;">
285
- <P style="font-size:small;"><I>Internal key (if applicable):</I> <INPUT TYPE="password" SIZE="4" NAME="key"></P>
286
- <P>Input Method 1) Enter a URL to a file on the web in the correct format (each line should be a separate citation; e.g., <A
287
- HREF="http://wing.comp.nus.edu.sg/~wing.nus/samples/E06-1050.cite">http://wing.comp.nus.edu.sg/~wing.nus/samples/E06-1050.cite</A> or <A
288
- HREF="http://wing.comp.nus.edu.sg/~wing.nus/samples/W06-0102.cite">http://wing.comp.nus.edu.sg/~wing.nus/samples/W06-0102.cite</A>).
289
- <INPUT TYPE="text" SIZE="80" NAME="urllines">
290
- </P>
291
-
292
- <P>Input Method 2) Upload a file (again, each line should be a separate citation)<BR/>
293
- <INPUT TYPE="FILE" NAME="datalines">
294
- </P>
295
-
296
- <P>Input Method 3) Enter a list of plain text citations (again, one per line):<BR/>
297
- <TEXTAREA ROWS="4" COLS="80" NAME="textlines">Isaac G. Councill, C. Lee Giles, Min-Yen Kan. (2008) ParsCit: An open-source CRF reference string parsing package. To appear in the proceedings of the Language Resources and Evaluation Conference (LREC 08), Marrakesh, Morrocco, May.
298
- </TEXTAREA>
299
- </P>
300
-
301
- <P>Citation export formats
302
- <INPUT TYPE=CHECKBOX NAME="ads3">ADS
303
- <INPUT TYPE=CHECKBOX NAME="bib3" CHECKED>BIB
304
- <INPUT TYPE=CHECKBOX NAME="end3">EndNote
305
- <INPUT TYPE=CHECKBOX NAME="isi3">ISI
306
- <INPUT TYPE=CHECKBOX NAME="ris3">RIS
307
- <INPUT TYPE=CHECKBOX NAME="wordbib3">WordBib
308
- </P>
309
-
310
- <br/><CENTER><INPUT TYPE="SUBMIT" VALUE="Parse these lines!"></CENTER>
311
- </FORM>
312
- </DIV>
313
-
314
- <!-- Publications ---------------------------------------------------------------------- -->
315
- <A name="p"></a><H2>Publications</H2>
316
- <P><B>Journal Papers:</B>
317
- <UL>
318
- <LI> Minh-Thang Luong, Thuy Dung Nguyen and Min-Yen Kan (forthcoming)
319
- <U>Logical Structure Recovery in Scholarly Articles with Rich
320
- Document Features</U>. Forthcoming in the International
321
- Journal of Digital Library Systems. <BR/>
322
- [ <A HREF="ijdls-SectLabel.pdf">pre-print .pdf</A> ]
323
- </UL>
324
-
325
- <P><B>International Refereed Conference Publications:</B>
326
- <UL>
327
- <LI> Isaac G. Councill, C. Lee Giles, Min-Yen Kan. (2008)
328
- <U>ParsCit: An open-source CRF reference string parsing
329
- package</U>. In Proceedings of the Language Resources and
330
- Evaluation Conference (LREC 08), Marrakesh, Morocco, May.
331
- <BR/> [ <A HREF="lrec08/lrec08.pdf">.pdf</A> ]
332
- [ <A HREF="lrec08b.png">Poster (.png)</A> ]
333
- </UL>
334
-
335
- <P><B>Others:</B>
336
- <UL>
337
- <LI> Yong Kiat Ng. (2004) <U>Citation Parsing Using Maximum Entropy
338
- and Repairs</U>. Undergraduate thesis. National University of
339
- Singapore. <BR/>
340
- [ <A HREF="yongKiatNgThesis.pdf">.pdf</A> ]
341
- </UL>
342
-
343
- <!-- Output ---------------------------------------------------------------------- -->
344
- <A name="gsiso"></a><H2>Gold Standard Input and Sample Output</H2>
345
-
346
- <UL>
347
- <LI>Chunk tagged data for <A HREF="cora.tagged.txt">Cora</A>, <A
348
- HREF="citeseerx.tagged.txt">CiteSeer<SUP>X</SUP></A>, <A
349
- HREF="flux-cim-cs.tagged.txt">FLUX-CiM</A> and humanities (<A
350
- HREF="it-humanities.tagged.txt">Italian</A>, <A
351
- HREF="en-humanities.tagged.txt">English</A>, and <A
352
- HREF="mixed-humanities.tagged.txt">mixed language</A>) datasets
353
- (suitable for ParsCit training). For FLUX-CiM data, please try
354
- the original hosting site maintained by Eli Cortez. Credits to
355
- Matteo Romanello for contributing the humanities datasets.
356
- <LI> <A HREF="iconip.tagged.txt">Chunk tagged data for some ICONIP
357
- papers</A>. Contributed by Cheong Chi Hung.
358
- <LI>Results of running the v080917 version of ParsCit on FLUX-CiM's
359
- dataset for [ <A HREF="flux-cim-cs.out.xml">300 computer science
360
- references</A> ] [ <A HREF="flux-cim-med.out.xml">2000 medical
361
- references</A> ] [ <A HREF="cora.out.xml">on the CORA dataset</A>
362
- ]. Note that these results are considered cheating as current
363
- version has been trained on this data.
364
- <LI> Tagged section data for the SectLabel module. <BR/> [ <A
365
- HREF="sectLabelXML.tagged.txt">XML Format</A> ] [ <A
366
- HREF="sectLabel.tagged.txt">Plain Text Format</A> ]<BR/>
367
- [ <A HREF="genericSect.tagged.txt">GenericSect training data</A> ]
368
- </UL>
369
-
370
- <!-- Group Members ---------------------------------------------------------------------- -->
371
- <A name="gm"></a><H2>Group Members</H2>
372
-
373
- <UL>
374
- <LI> <A HREF="http://www.comp.nus.edu.sg/~kanmy">Min-Yen Kan</A> - Project leader, NUS
375
- <LI> <A HREF="http://www.personal.psu.edu/igc2/">Isaac G. Councill</A>, The Pennsylvania State University
376
- <LI> <A HREF="http://clgiles.ist.psu.edu/">C. Lee Giles</A>, The Pennsylvania State University
377
- <LI> <A HREF="http://wing.comp.nus.edu.sg/~lmthang">Minh-Thang Luong</A> - Research Assistant (alumnus), NUS
378
- <LI> Yong Kiat Ng - Final year undergraduate student (graduated, 2004), NUS
379
- <LI> Thuy Dung Nguyen - Research Assistant (alumnus), NUS
380
- <LI> Huy Nhat Hoang Do - Research Assistant, NUS
381
- </UL>
382
-
383
- <!-- FAQ ---------------------------------------------------------------------- -->
384
- <A name="faq"></a><H2>FAQ</H2>
385
- <DL>
386
- <DT>What platforms does ParsCit work on?</DT>
387
- <DD>ParsCit works on all major platforms: Windows, Linux and MacOS.
388
- The installation requires ruby and perl and the CRF++ embedded
389
- package also requires standard UNIX utilities like sed. You
390
- should have a working knowledge of UNIX and some experience in
391
- installing UNIX tools. Due to our time constraints, we may not be
392
- able to answer your particular problems with installation. Do let us
393
- know if there was something important that you had to do to get
394
- your particular download and installation working; we'll
395
- incorporate it into the Troubleshooting section below.</DD>
396
- <DT>What is the difference between SectLabel and the previous ParsHed?</DT>
397
- <DD>SectLabel is a newly-developed module that further extends
398
- ParsHed in functionality. It not only classifies header metadata,
399
- but analyzes full documents to output the logical structure of
400
- the internals of the document -- sections, subsections, figures,
401
- tables, equations, footnotes and captions. <BR/> For compatibility
402
- issues, the ParsHed module is still retained in our source code
403
- and command line options. </DD>
404
-
405
- <DT>How do I retrain ParsCit for a different language? I saw code in
406
- lib/ParsCit/PreProcess' to find the beginning of the bibliography
407
- section, and changed that but it doesn't work.</DT>
408
- <DD>The current version does not depend on those regular expressions
409
- anymore, they are for previous versions (e.g., v101101). ParsCit
410
- now first labels each line using the SectLabel module and
411
- discovers which lines to parse references for based on the first
412
- step's output. You need to retrain SectLabel for this, by
413
- providing labeled data about what class of line each line in your
414
- training data is. It's also possible to "downgrade" the current
415
- version to go back to use the rule-based method for identifying
416
- the reference section.</DD>
417
- <DT>What is the "genericHeader" in the output of SectLabel? What is
418
- the difference between "genericSect.tagged" and "SectLabel.tagged"?</DT>
419
- <DD>Generic headers, such as introduction, methodology, and
420
- evaluation, represent generic purposes of different sections in a
421
- scholarly article. We map all section names to generic ones
422
- (i.e., "5. Text Features" to "Methodology"). This promotes
423
- comparative viewing of sections with identical purpose across
424
- articles. For the second question, actually, Generic section is
425
- a component of SectLabel. It is responsible for classifying the
426
- section headers of a paper into the generic categories such as
427
- Introduction, Methodology, Result, etc. For details refer to our
428
- IJDLS journal paper.
429
- </DD>
430
- <DT>Why is there an option to input file in XML format? Which DTD
431
- should it follow?</DT>
432
- <DD>SectLabel is a robust logical document structure inference
433
- system that can handle both rich input (produced by OCR software
434
- such as font or spatial features) to boost recognition
435
- performance, but still be able to perform inference on
436
- impoverished input (plain text) with degraded
437
- performance. Currently, the XML input must be in the form of
438
- output from Nuance OmniPage (version 16)'s XML format, and hence,
439
- should follow the DTD by OmniPage. Note: The ParsCit team is not
440
- affiliated with Nuance in any way nor does it endorse
441
- OmniPage.</DD>
442
- <DT> I need to run ParsCit but I can't get well-formed text from my
443
- PDF documents. Can you help?</DT>
444
- <DD> No, we cannot help you with this. We don't perform OCR or text
445
- extraction from PDF documents. You will have to find your own
446
- source for doing the extraction or conversion. We've found
447
- Omnipage useful in our own project work (hence the possibility of
448
- XML input), but we don't endorse any product.</DD>
449
- <DT> The OmniPage XML doesn't seem to be well-formed. Is that OK?</DT>
450
- <DD> Yes. The sample "XML" provided in the links (for Demo 2) are
451
- actual outputs for a sequence of XML pages (one XML file per
452
- page). If you use OmniPage to save an XML file for input to
453
- ParsCit, make sure to save individual pages as separate files,
454
- then concatenate them to send to ParsCit. You may want to
455
- download the sample links for inspection (as they are
456
- concatenations of several XML files, your browser will likely
457
- complain about them not being well-formed).</DD>
458
- <DT> I ran Demos 1 and 2 with the default "all" settings, but sections
459
- don't seem to be detected.</DT>
460
- <DD> There's no problem. The demo just hides the SectLabel output
461
- by default. Click "Show SectLabel output" to reveal it.</DD>
462
- <DT> I ran ParsCit using the OmniPage XML output, but encountered malformed UTF8 character errors.</DT>
463
- <DD> OmniPage normally outputs XML results in UTF-16 format; a conversion into UTF-8 will solve the problem, see below: <BR/>
464
- <I>&nbsp; &nbsp; &nbsp; iconv --from-code UTF-16 --to-code UTF-8 omnipageOutput.xml > newOmnipageOutput.xml</I>
465
- </DD>
466
- </DL>
467
-
468
- <!-- Troubleshooting ---------------------------------------------------------------------- -->
469
- <A name="t"></a><H2>Troubleshooting</H2>
470
-
471
- <P> A list of common problems with ParsCit. If you find problems,
472
- email the lead developer at &lt;kanmy@comp.nus.edu.sg&gt;. Please use
473
- the subject "[ParsCit]" to ensure that it reaches our attention. If
474
- you have hand-corrected tagged data that you don't mind providing us,
475
- we can use that to further improve ParsCit's extracting capabilities.
476
- Nevertheless, there are problems with the output occasionally. Below
477
- are some common problems people have encountered.
478
-
479
- <DL>
480
- <DT>ParsCit v110505 seems to be a lot slower when used on Omnipage
481
- output than the previous versions, why?</DT>
482
- <DD>You are correct. We are now using XML::Twig to do the XML
483
- processing correctly, rather than do it ad-hoc ourselves, but this
484
- requires constructing an exhaustive DOM tree for the Omnipage input.
485
- This is the timesink that you are experiencing.</DD>
486
- <DT>I'm running ParsCit on Windows but I can't get it to work, even
487
- after installing a perl interpreter. Specifically, the
488
- citeExtract.pl program dies complaining that it Can't open
489
- "/tmp/...." at line 175. </DT>
490
- <DD>ParsCit hasn't been fully tested on windows at NUS, so we can't
491
- vouch for whether it will run correctly. In this specific error
492
- case, the "/tmp/" directory (a standard place for temporary files in
493
- UNIX systems) is normally not available in Windows, and may generate
494
- problems. You may need to change the code and/or create an
495
- appropriate directory for ParsCit to generate such files.</DD>
496
- <DT>I tried downloading and running ParsCit but I get complaints
497
- about /bin/sed and crf not being found. Help?</DT>
498
- <DD>Please read the INSTALL.txt directions. You need to recompile
499
- CRF++ for your platform. The paths included with the install are
500
- for our version, you need to recompile to have the paths point
501
- correctly.</DD>
502
- <DT>When running citeExtract.pl I get some errors complaining about
503
- the wrong ELF class of the binaries. How can I fix this?</DT>
504
- <DD>This seems to be a problem with the compiled executables of
505
- CRF++ bundled with the software. Follow the INSTALL instructions
506
- but after step 1 do:
507
- <P>
508
- <CODE>$ cp -Rf * ../../.libs<BR/>
509
- $ cp crf_learn ../../.libs/lt-crf_learn<BR/>
510
- $ cp crf_test ../../.libs/lt-crf_test<BR/>
511
- </CODE></DD>
512
- <DT>I'm trying to install parscit v110505 using the instructions in the install file, and when I get to the point where you're supposed to recompile CRF, it exits with an error:<BR/>
513
-
514
- <PRE>In file included from node.h:13:0,
515
- from node.cpp:9:
516
- path.h:26:52: error: 'size_t' has not been declared
517
- make[1]: *** [node.lo] Error 1
518
- make[1]: Leaving directory `/home/agarnett/parscit/crfpp/CRF++-0.51'
519
- make: *** [all] Error 2</PRE><BR/>
520
- The install file mentions that this may fail the first time; unfortunately for me, it keeps failing. any help?</DT>
521
- <DD>The error is from CRF++ package (not from ParsCit), there are two ways to fix it:<BR/>
522
- 1. Add the line. <CODE>#include&lt;iostream&gt;</CODE> in node.cpp and compile crf++ again, or;<BR/>
523
- 2. Go to <A HREF="http://crfpp.googlecode.com/svn/trunk/doc/index.html">http://crfpp.googlecode.com/svn/trunk/doc/index.html</A> and download the latest version. The instruction is the same. Hope this helps.</DD>
524
- <DT>Issue numbers don't get extracted.</DT>
525
- <DD><SPAN CLASS="red">This issue should be fixed as of the v110505
526
- release.</SPAN> There is now some heuristic postprocessing code to
527
- take care of breaking single or multiple tokens for issues and
528
- volumes. </DD>
529
- <DT>Separation of author names and publishing year fails</DT>
530
- <DD> In some reference data with non-standard sequences of
531
- first names and family names, e.g.
532
- <pre>
533
- Baltes, Paul, Ursula Staudinger, Ulmann Lindenberger (1999): Lifespan
534
- psychology: theory and application of intellectual functioning; in:
535
- Annual Review of Psychology, 50, 471-507
536
- </pre>
537
- ParsCit's post processing step may not detect and deal with these
538
- problems reliably. We're working to fix these too. </DD>
539
- <DT>I passed ParsCit plain text output but in another, non-English
540
- language. I didn't get good results or I got empty results. Can
541
- you help? </DT>
542
- <DD>Aside from English, ParsCit can handle Italian and German to a
543
- limited extent, thanks to the multilingual training data.
544
- However, the demo web interface uploads non-ASCII (e.g., UTF-8 or
545
- UTF-16 data) as binary data and fails to execute ParsCit.
546
- However, if you download a copy of ParsCit, the libraries do work
547
- on such data. Here's a <A
548
- HREF="humanities.test.out.xml">sample</A>. We'd love to help make
549
- a more universal model that can accommodate reference strings in
550
- other languages. If you're willing to help contribute ground
551
- truth data, we'd love to hear from you!</DD>
552
-
553
- <DT>How about retraining ParsCit for another language/domain?</DT>
554
- <DD>You can put your supervised exemplar data into the same format
555
- as tagged_references.txt found in crfpp/traindata/. Once you have
556
- this file you can generate the appropriate model for ParsCit, by
557
- using three commands (assumes you are in the crfpp/traindata
558
- directory):
559
- <P>
560
- <CODE>$ ../../bin/tr2crfpp.pl tagged_references.txt > parsCit.train.data
561
- <BR/>
562
- $ ../crf_learn parsCit.template parsCit.train.data model
563
- <BR/>
564
- $ mv model ../../resources/parsCit.model
565
- </CODE>
566
- <P>The first command creates the input feature file that crfpp uses
567
- from the training data. The second creates the model using the
568
- crf_learn command. You can then move the model file to the
569
- resources/ subdirectory where it can be utilized. To replace the
570
- default model that comes with ParsCit, just execute the final
571
- command. </DD>
572
- <DT> Can I retrain the package for a different set of tags if I
573
- change the tagset in the training data?</DT>
574
- <DD> Yes, you should be able to change the tagset to suit your
575
- dataset. You can add, eliminate and change the tagset as you
576
- wish. You need to retrain the parser system after creating your
577
- tag data. For more details on the training process, see the
578
- documentation for CRF++, that is on the web at sourceforge.
579
- </DD>
580
- <DT>When retraining I get a "bad_alloc" error. What gives?</DT>
581
- <DD>We're not entirely sure of this. CRF training is quite memory
582
- intensive and running a large amount of training data tuples may
583
- cause the embedded CRF++ package to fail. You can try with less
584
- training data, or try training on a machine with a larger amount
585
- of RAM. </DD>
586
- <DT>Does the web service actually work? I can't seem to run it.</DT>
587
- <DD>Occasionally our school's networking staff changes the firewall
588
- settings, so the port for our group's web services may be blocked
589
- (port 4000 on host wing.comp.nus.edu.sg). If you find you can't
590
- reach our services (they time out), please let us know. </DD>
591
- <DT>I get funny errors with crf_test not being useful. How do I
592
- fix this?</DT> <DD>The updated README.txt file in the 090625b
593
- distribution fixes this. Basically you need to recompile CRF++
594
- 0.51 and place the libraries and the executables in the proper
595
- place. See the README for details.</DD>
596
-
597
- </DL>
598
-
599
- <!-- Kudos ---------------------------------------------------------------------- -->
600
- <H2>Kudos</H2>
601
-
602
- <p>ParsCit owes its continued maintenance and support to its user
603
- base. Here we'd like to thank them for their help.</p>
604
-
605
- <P>Thanks to David Judd who reconfigured how CRF++ is located with
606
- respect to the main code. Thanks to Alex Garnett in spotting more
607
- problems with CRF dependencies. Thanks to George E. Raptis and Eric
608
- Tran for the port to Windows. Thanks to Zhu Ying-Bo
609
- (yumichika@163.com) from the Language Computing and Web Mining Group,
610
- Institute of Computer Science and Technology of Peking University for
611
- the partial port to Windows. Thanks to Yustus Oktian for questions
612
- about training for another language. Thanks to Madhur Kapoor for
613
- asking questions about PDF conversion. Thanks to Behrang Qasemizadeh
614
- for reporting problems with truncation of XML entities in XML output
615
- (v110505). Thanks Tim Brody for his BiblioScript patch. Thanks to
616
- David Jurgens for suggesting that we remove temporary files after runs
617
- (v110505). Thanks Nikolay Nikolov for suggesting the conversion of
618
- OmniPage XML results from UTF-16 to UTF-8 to avoid encoding
619
- problems. Thanks to Matteo Romanello for the suggestion and permission
620
- to incorporate BiblioScript software (v101101). Many thanks to Kris
621
- Jack for pointing out problems with the ELF binaries and an
622
- appropriate fix. Thanks to Cheong Chi Hong for fixing problems with
623
- Preprocess.pm (v100401) and contributing the ICONIP training data and
624
- XML entity problems in reference string parsing (v100401). Thanks to
625
- Priya Venkateshan for pointing out sudo/root installation
626
- possibilities (v100401). Thanks to Mario Lipinski for reporting
627
- punctuation stripping problems in reference string parsing (v100401).
628
- Thanks to Artemy Kolchinsky for fixes in Preprocess.pm
629
- (v090625). Thanks to Matteo Romanello for the humanities training
630
- datasets. Thanks to Dain Kaplan for helping us fix the Preprocess.pm
631
- bug. Thanks to Ayeh Bandeh-Ahmadi for correcting the warning in
632
- parseRefString.pl. Thanks to Nick Friedrich and J&ouml;ran Beel of
633
- scienstein.org for all fixes in the v081201 version of ParsCit. Also
634
- thanks to Madian Khabsa for indicating problems with installation to
635
- MacOS.</p>
636
-
637
- <P>ParsCit is used by many projects worldwide, and not just in
638
- experimental, research and academic places, but in commercial
639
- enterprises as well. <A HREF="http://www.mendeley.com/">Mendeley</A>
640
- is using ParsCit to parse references from contributed papers, as is
641
- the <A HREF="http://citec.repec.org/">Citations in Economics
642
- (CitEc)</A> project.
643
-
644
-
645
- <!-- Related Links ---------------------------------------------------------------------- -->
646
- <H2>Related Links</H2>
647
-
648
- <P>Other, open-source citation parsers:
649
-
650
- <UL>
651
- <LI> <A
652
- HREF="http://freecite.library.brown.edu/welcome">FreeCite</A>:
653
- supported by the Mellon Foundation and Brown University. Written in
654
- Ruby on Rails, with the same CRF++ backend.
655
- <LI> A <A
656
- HREF="http://purl.net/net/egh/hmm-citation-extractor/">Hidden Markov
657
- Model Citation Extractor</A>: written by Erik Hetzner of the
658
- California Digital Library.
659
- </UL>
660
-
661
- <P> Other related links. Contact Min below to get your other related
662
- software listed here. Thanks!
663
-
664
- <UL>
665
- <LI> Perhaps you're interested in open source code for libraries?
666
- If so try the <A
667
- HREF="http://dewey.library.nd.edu/mailing-lists/code4lib/">CODE4LIB
668
- mailing list</A>.
669
-
670
- <LI> <A
671
- HREF="https://wiki.birncommunity.org:8443/display/NEWBIRNCC/LATISI+-+Literature+Annotation+Tool+from+the+Information+Sciences+Institute">LATISI
672
- - Literature Annotation Tool from the Information Sciences
673
- Institute</A>. A related project from ISI, using MBL instead of CRF.
674
- <LI> <A HREF="http://www.scienstein.org">Scienstein.org</A>: A
675
- recommendation system for papers.
676
- <LI> PdfBox: An open-source package for extracting text information
677
- from PDF files. Does not deal with problems with custom font
678
- encodings.
679
- </UL>
680
-
681
- <HR>
682
- <H5><ADDRESS><A HREF="http://www.comp.nus.edu.sg/~kanmy">Min-Yen Kan</A> &lt;<A HREF="mailto:kanmy@comp.nus.edu.sg">kanmy@comp.nus.edu.sg</A>&gt;</ADDRESS>
683
- Created on: Fri Dec 24 01:48:05 SGT 2004
684
- <!-- hhmts start -->
685
- | Version: 1.0
686
-
687
- | Last modified:
688
- Mon Mar 4 14:23:46 SGT 2013
689
- <!-- hhmts end -->
690
- </H5>
691
- </div>
692
- </BODY> </HTML>