anystyle-parser 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.autotest +0 -0
- data/.gitignore +5 -0
- data/.rspec +3 -0
- data/Gemfile +21 -0
- data/HISTORY.md +3 -0
- data/LICENSE +26 -0
- data/README.md +152 -0
- data/anystyle-parser.gemspec +37 -0
- data/features/step_definitions/parser_steps.rb +0 -0
- data/features/support/env.rb +1 -0
- data/lib/anystyle/parser/dictionary.rb +165 -0
- data/lib/anystyle/parser/errors.rb +19 -0
- data/lib/anystyle/parser/features.rb +164 -0
- data/lib/anystyle/parser/normalizer.rb +322 -0
- data/lib/anystyle/parser/parser.rb +240 -0
- data/lib/anystyle/parser/support/anystyle.mod +7891 -0
- data/lib/anystyle/parser/support/anystyle.pat +72 -0
- data/lib/anystyle/parser/support/dict.txt.gz +0 -0
- data/lib/anystyle/parser/utility.rb +19 -0
- data/lib/anystyle/parser/version.rb +5 -0
- data/lib/anystyle/parser.rb +17 -0
- data/spec/anystyle/parser/dictionary_spec.rb +31 -0
- data/spec/anystyle/parser/features_spec.rb +24 -0
- data/spec/anystyle/parser/normalizer_spec.rb +36 -0
- data/spec/anystyle/parser/parser_spec.rb +85 -0
- data/spec/benchmark.rb +74 -0
- data/spec/profile.rb +34 -0
- data/spec/spec_helper.rb +1 -0
- metadata +169 -0
data/.autotest
ADDED
File without changes
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# Gemfile for anystyle-parser. The runtime and development dependencies are
# taken from the gemspec; the groups below only add optional tooling.

# The symbolic `:rubygems` source is deprecated by Bundler; name the
# HTTPS endpoint explicitly instead.
source 'https://rubygems.org'
gemspec

# Interactive debuggers, one per interpreter platform.
group :debug do
  gem 'ruby-debug19', :require => 'ruby-debug', :platforms => [:mri_19]
  gem 'ruby-debug', :platforms => [:mri_18, :jruby]
  gem 'rbx-trepanning', :platforms => [:rbx]
end

# File-system event support for autotest; not required automatically.
group :osx_test do
  gem 'autotest-fsevent', :require => false
end

# Profiling and plotting helpers.
group :profile do
  gem 'ruby-prof'
  gem 'gnuplot'
end

# Optional Kyoto Cabinet bindings; the gem is required as 'kyotocabinet'.
group :kyotocabinet do
  gem 'kyotocabinet-ruby', :require => 'kyotocabinet'
end
|
data/HISTORY.md
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
Copyright 2011 Sylvester Keil. All rights reserved.
|
2
|
+
|
3
|
+
Redistribution and use in source and binary forms, with or without
|
4
|
+
modification, are permitted provided that the following conditions are met:
|
5
|
+
|
6
|
+
1. Redistributions of source code must retain the above copyright notice,
|
7
|
+
this list of conditions and the following disclaimer.
|
8
|
+
|
9
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
10
|
+
this list of conditions and the following disclaimer in the documentation
|
11
|
+
and/or other materials provided with the distribution.
|
12
|
+
|
13
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR
|
14
|
+
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
15
|
+
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
16
|
+
EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
17
|
+
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
18
|
+
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
19
|
+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
20
|
+
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
21
|
+
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
22
|
+
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
23
|
+
|
24
|
+
The views and conclusions contained in the software and documentation are
|
25
|
+
those of the authors and should not be interpreted as representing official
|
26
|
+
policies, either expressed or implied, of the copyright holder.
|
data/README.md
ADDED
@@ -0,0 +1,152 @@
|
|
1
|
+
Anystyle-Parser
|
2
|
+
===============
|
3
|
+
|
4
|
+
Anystyle-Parser is a very fast and smart parser for academic references. It
|
5
|
+
is inspired by [ParsCit](http://aye.comp.nus.edu.sg/parsCit/) and
|
6
|
+
[FreeCite](http://freecite.library.brown.edu/); Anystyle-Parser is designed
|
7
|
+
for raw speed (it uses [wapiti](https://github.com/inukshuk/wapiti-ruby) based
|
8
|
+
conditional random fields and [Kyoto Cabinet](http://fallabs.com/kyotocabinet/)
|
9
|
+
as a key-value store), flexibility (it is easy to train the model with
|
10
|
+
data that is relevant to your parsing needs), and compatibility (Anystyle-Parser
|
11
|
+
exports to Ruby Hashes, BibTeX, or the CiteProc JSON format).
|
12
|
+
|
13
|
+
Installation
|
14
|
+
------------
|
15
|
+
|
16
|
+
$ [sudo] gem install anystyle-parser
|
17
|
+
|
18
|
+
During the statistical analysis of reference strings, Anystyle-Parser relies
|
19
|
+
on a large feature dictionary; by default, Anystyle-Parser creates a
|
20
|
+
[Kyoto Cabinet](http://fallabs.com/kyotocabinet/) file-based hash database
|
21
|
+
from the dictionary file that ships with the parser. If Kyoto Cabinet is
|
22
|
+
not installed on your system, Anystyle-Parser uses a simple Ruby Hash as a
|
23
|
+
fall-back; this Hash has to be re-created every time you load the parser
|
24
|
+
and takes up a lot of memory in your Ruby process; it is therefore strongly
|
25
|
+
recommended to install Kyoto Cabinet and the `kyotocabinet-ruby` gem.
|
26
|
+
|
27
|
+
$ [sudo] gem install kyotocabinet-ruby
|
28
|
+
|
29
|
+
The database file will be created the first time you access the dictionary;
|
30
|
+
note that you will need write permissions in the directory where the file
|
31
|
+
is to be created. You can change the Dictionary's default path in the
|
32
|
+
Dictionary's options:
|
33
|
+
|
34
|
+
Anystyle::Parser::Dictionary.instance.options[:path]
|
35
|
+
|
36
|
+
|
37
|
+
Usage
|
38
|
+
-----
|
39
|
+
|
40
|
+
### Parsing
|
41
|
+
|
42
|
+
You can access the main Anystyle-Parser instance at `Anystyle.parser`;
|
43
|
+
the `#parse` method is also available via `Anystyle.parse`. For more complex
|
44
|
+
requirements (e.g., if you need multiple Parsers simultaneously) you can create
|
45
|
+
your own instances from the `Anystyle::Parser::Parser` class.
|
46
|
+
|
47
|
+
The two fundamental methods you need to know about in order to use
|
48
|
+
Anystyle-Parser are `#parse` and `#train` that both accept two arguments.
|
49
|
+
|
50
|
+
Parser#parse(input, format = :hash)
|
51
|
+
Parser#train(input, truncate = false)
|
52
|
+
|
53
|
+
`#parse` parses the passed-in input (either a filename, your reference strings,
|
54
|
+
or an array of your reference strings) and returns the parsed data in the
|
55
|
+
format specified as the second argument (supported formats include: *:hash*,
|
56
|
+
*:bibtex*, and *:citeproc*).
|
57
|
+
|
58
|
+
`#train` allows you to easily train the Parser's CRF model. The first argument
|
59
|
+
is either a filename or your data as a string; the format of training data
|
60
|
+
follows the XML-like syntax of the
|
61
|
+
[CORA dataset](http://www.cs.umass.edu/~mccallum/data/cora-ie.tar.gz); the
|
62
|
+
optional boolean argument lets you decide whether to train the existing
|
63
|
+
model or to create an entirely new one.
|
64
|
+
|
65
|
+
The following irb session illustrates some parser goodness:
|
66
|
+
|
67
|
+
> require 'anystyle/parser'
|
68
|
+
> Anystyle.parse 'Poe, Edgar A. Essays and Reviews. New York: Library of America, 1984.'
|
69
|
+
=> [{:author=>"Poe, Edgar A.", :title=>"Essays and Reviews", :location=>"New York", :publisher=>"Library of America", :year=>1984, :type=>:book}]
|
70
|
+
> b = Anystyle.parse 'Dong C. Liu and Jorge Nocedal. 1989. On the limited memory BFGS method for large scale optimization. Mathematical Programming, 45:503–528.', :bibtex
|
71
|
+
> b[0].author[1].given
|
72
|
+
=> "Jorge"
|
73
|
+
> b[0].author.to_s
|
74
|
+
=> "Liu, Dong C. and Nocedal, Jorge"
|
75
|
+
|
76
|
+
### Unhappy with the results?
|
77
|
+
|
78
|
+
Citation references come in many forms, so, inevitably, you will find data
|
79
|
+
where Anystyle-Parser does not produce satisfying parsing results.
|
80
|
+
|
81
|
+
> Anystyle.parse 'John Lafferty, Andrew McCallum, and Fernando Pereira. 2001. Conditional random fields: probabilistic models for segmenting and labeling sequence data. In Proceedings of the International Conference on Machine Learning, pages 282-289. Morgan Kaufmann, San Francisco, CA.'
|
82
|
+
=> [{:author=>"John Lafferty and Andrew McCallum and Fernando Pereira. 2001", :title=>"Conditional random fields: probabilistic models for segmenting and labeling sequence data", :booktitle=>"Proceedings of the International Conference on Machine Learning", :pages=>"282--289", :publisher=>"Morgan Kaufmann", :location=>"San Francisco, CA", :type=>:inproceedings}]
|
83
|
+
|
84
|
+
This result is not bad, but notice how the year was not picked up as a date
|
85
|
+
but interpreted as part of the author name. If you have such a problem
|
86
|
+
(particularly, if the problem applies to a range of your input data, e.g.,
|
87
|
+
data that follows a style that Anystyle-Parser was not trained to recognize),
|
88
|
+
you can teach Anystyle-Parser to recognize your format. The easiest way to
|
89
|
+
go about this is to create a new file (e.g., 'training.txt'), copy and paste a
|
90
|
+
few references, and tag them for training. For example, a tagged version of
|
91
|
+
the input from the example above would look like this:
|
92
|
+
|
93
|
+
<author> John Lafferty, Andrew McCallum, and Fernando Pereira. </author> <date> 2001. </date> <title> Conditional random fields: probabilistic models for segmenting and labeling sequence data. </title> <booktitle> In Proceedings of the International Conference on Machine Learning, </booktitle> <pages> pages 282–289. </pages> <publisher> Morgan Kaufmann, </publisher> <location> San Francisco, CA. </location>
|
94
|
+
|
95
|
+
Note that you can pick any tag names, but when working with Anystyle's model
|
96
|
+
you should use the same names used to train the model. You can always ask
|
97
|
+
the Parser's model what names (labels) it knows about:
|
98
|
+
|
99
|
+
> Anystyle.parser.model.labels
|
100
|
+
=> ["author", "booktitle", "container", "date", "edition", "editor", "institution", "journal", "location", "note", "pages", "publisher", "tech", "title", "unknown", "volume"]
|
101
|
+
|
102
|
+
Once you have tagged a few references that you want Anystyle-Parser to learn,
|
103
|
+
you can train the model as follows:
|
104
|
+
|
105
|
+
> Anystyle.parser.train 'training.txt', false
|
106
|
+
|
107
|
+
By passing `true` as the second argument, you will discard Anystyle's default
|
108
|
+
model; the resulting model will be based entirely on your own data. By default
|
109
|
+
the new or altered model will not be saved, but you can do so at any time
|
110
|
+
by calling `Anystyle.parser.model.save` to save the model to the default file.
|
111
|
+
If you want to save the model to a different file, set the
|
112
|
+
`Anystyle.parser.model.path` attribute accordingly.
|
113
|
+
|
114
|
+
After teaching Anystyle-Parser with the tagged references, try to parse your
|
115
|
+
data again:
|
116
|
+
|
117
|
+
> Anystyle.parse 'John Lafferty, Andrew McCallum, and Fernando Pereira. 2001. Conditional random fields: probabilistic models for segmenting and labeling sequence data. In Proceedings of the International Conference on Machine Learning, pages 282-289. Morgan Kaufmann, San Francisco, CA.'
|
118
|
+
=> [{:author=>"John Lafferty and Andrew McCallum and Fernando Pereira", :title=>"Conditional random fields: probabilistic models for segmenting and labeling sequence data", :booktitle=>"Proceedings of the International Conference on Machine Learning", :pages=>"282--289", :publisher=>"Morgan Kaufmann", :location=>"San Francisco, CA", :year=>2001, :type=>:inproceedings}]
|
119
|
+
|
120
|
+
|
121
|
+
Contributing
|
122
|
+
------------
|
123
|
+
|
124
|
+
The Anystyle-Parser source code is
|
125
|
+
[hosted on GitHub](http://github.com/inukshuk/anystyle-parser/).
|
126
|
+
You can check out a copy of the latest code using Git:
|
127
|
+
|
128
|
+
$ git clone https://github.com/inukshuk/anystyle-parser.git
|
129
|
+
|
130
|
+
If you've found a bug or have a question, please open an issue on the
|
131
|
+
[Anystyle-Parser issue tracker](http://github.com/inukshuk/anystyle-parser/issues).
|
132
|
+
Or, for extra credit, clone the Anystyle-Parser repository, write a failing
|
133
|
+
example, fix the bug and submit a pull request.
|
134
|
+
|
135
|
+
|
136
|
+
License
|
137
|
+
-------
|
138
|
+
|
139
|
+
Copyright 2011 Sylvester Keil. All rights reserved.
|
140
|
+
|
141
|
+
Some of the code in Anystyle-Parser's post processing (normalizing) routines
|
142
|
+
was originally based on the source code of FreeCite and
|
143
|
+
|
144
|
+
Copyright 2008 Public Display Inc.
|
145
|
+
|
146
|
+
The CRF template is a modified version of ParsCit's original template
|
147
|
+
|
148
|
+
Copyright 2008, 2009, 2010, 2011 Min-Yen Kan,
|
149
|
+
Isaac G. Councill, C. Lee Giles, Minh-Thang Luong and Huy Nhat Hoang
|
150
|
+
Do.
|
151
|
+
|
152
|
+
Anystyle-Parser is distributed under a BSD-style license. See LICENSE for details.
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-

# Make the gem's own lib directory loadable so that the version constant
# can be required below.
lib = File.expand_path('../lib/', __FILE__)
$:.unshift lib unless $:.include?(lib)

require 'anystyle/parser/version'

# Gem specification for anystyle-parser.
Gem::Specification.new do |s|
  s.name        = 'anystyle-parser'
  s.version     = Anystyle::Parser::VERSION.dup
  s.platform    = Gem::Platform::RUBY
  s.authors     = ['Sylvester Keil']
  # NOTE(review): this value is a URL, not an e-mail address -- confirm the
  # intended contact address.
  s.email       = ['http://sylvester.keil.or.at']
  s.homepage    = 'http://inukshuk.github.com/anystyle-parser'
  s.summary     = 'Parser for academic references.'
  s.description = 'A sophisticated parser for academic references based on conditional random fields.'
  s.license     = 'FreeBSD'

  s.add_runtime_dependency('bibtex-ruby', '~>1.3')
  s.add_runtime_dependency('wapiti', '~>0.0')

  s.add_development_dependency('rake', ['~>0.9'])
  s.add_development_dependency('racc', ['~>1.4'])
  s.add_development_dependency('cucumber', ['~>1.0'])
  s.add_development_dependency('rspec', ['~>2.6'])
  s.add_development_dependency('ZenTest', ['~>4.6'])

  # Everything tracked by git, minus anything below resources/.
  s.files        = `git ls-files`.split("\n") - Dir['resources/**/*']
  s.test_files   = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables  = []
  s.require_path = 'lib'

  # An explicit array is used here: inside a %w{} literal, double quotes are
  # ordinary characters and would become part of the RDoc title.
  s.rdoc_options     = ['--line-numbers', '--inline-source', '--title', 'Anystyle Parser', '--main', 'README.md']
  s.extra_rdoc_files = %w{README.md LICENSE}

end

# vim: syntax=ruby
|
File without changes
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'anystyle/parser'
|
@@ -0,0 +1,165 @@
|
|
1
|
+
module Anystyle
|
2
|
+
module Parser
|
3
|
+
|
4
|
+
# Dictionary is a Singleton object that provides a key-value store of
# the Anystyle Parser dictionary required for feature elicitation.
# This dictionary acts essentially like a Ruby Hash object, but because
# of the dictionary's size it is not efficient to keep the entire
# dictionary in memory at all times. For that reason, Dictionary
# creates a persistent data store on disk using Kyoto Cabinet; if
# Kyoto Cabinet is not installed a Ruby Hash is used as a fall-back.
#
# The database will be automatically created from the dictionary file
# using the best available DBM the first time it is accessed. Once the
# database file exists, the database will be restored from file.
# Therefore, if you make changes to the dictionary file, you will have
# to delete the old database file for a new one to be created.
#
# Database creation requires write permissions. By default, the database
# will be created in the support directory of the Parser; if you have
# installed the gem version of the Parser, you may not have write
# permissions, but you can change the path in the Dictionary's options.
#
#     Dictionary.instance.options[:path] # => the database file
#     Dictionary.instance.options[:source] # => the (zipped) dictionary file
#
class Dictionary

  include Singleton

  @defaults = {
    :source => File.expand_path('../support/dict.txt.gz', __FILE__),
    :path => File.expand_path('../support/dict.kch', __FILE__)
  }.freeze

  @keys = [:male, :female, :surname, :month, :place, :publisher, :journal].freeze

  # Every key is assigned its own power of two (1, 2, 4, ...) so that
  # several categories can be combined in one integer; unknown keys map
  # to 0.
  @code = Hash[*@keys.zip(0.upto(@keys.length-1).map { |i| 2**i }).flatten]
  @code.default = 0
  @code.freeze

  # :kyoto if the kyotocabinet library can be loaded, :hash otherwise.
  @mode = begin
    require 'kyotocabinet'
    :kyoto
  rescue LoadError
    :hash
  end

  class << self

    attr_reader :keys, :code, :defaults, :mode

  end

  attr_reader :options

  def initialize
    # The defaults are frozen; work on a private, modifiable copy.
    @options = Dictionary.defaults.dup
  end

  # Returns the integer code stored for +key+ (0 if there is none),
  # opening the database first if necessary.
  def [](key)
    db[key.to_s].to_i
  end

  # Stores +value+ under the string form of +key+.
  def []=(key, value)
    db[key.to_s] = value
  end

  # Builds the Kyoto Cabinet file at #path from the dictionary source,
  # removing any existing file first. Does nothing in :hash mode.
  #
  # Raises DictionaryError if the cabinet file cannot be created.
  def create
    case Dictionary.mode
    when :kyoto
      truncate
      @db = KyotoCabinet::DB.new
      unless @db.open(path, KyotoCabinet::DB::OWRITER | KyotoCabinet::DB::OCREATE)
        # Use the same error class as #open.
        raise DictionaryError, "failed to create cabinet file #{path}: #{@db.error}"
      end
      populate
      close
    else
      # nothing
    end
  end

  # Closes the database and deletes the database file, if there is one.
  def truncate
    close
    File.unlink(path) if File.exist?(path)
  end

  # Opens the dictionary and returns the underlying store. In :kyoto mode
  # the cabinet file is created if it is missing and then opened for
  # reading; in :hash mode a Hash is filled from the dictionary source.
  #
  # Raises DictionaryError if the cabinet file cannot be opened.
  def open
    create unless File.exist?(path)

    case Dictionary.mode
    when :kyoto
      # Make sure the cabinet is closed when the process exits.
      at_exit { ::Anystyle::Parser::Dictionary.instance.close }

      @db = KyotoCabinet::DB.new
      unless @db.open(path, KyotoCabinet::DB::OREADER)
        raise DictionaryError, "failed to open cabinet file #{path}: #{@db.error}"
      end
    else
      @db = Hash.new(0)
      populate
    end

    @db
  end

  def open?; !!@db; end

  # Closes the underlying store if it supports closing and forgets it, so
  # that the next access re-opens the dictionary.
  def close
    @db.close if @db.respond_to?(:close)
    @db = nil
  end

  # The location of the database file.
  def path
    options[:path]
  end

  private

  # The current store; opened on first use.
  def db
    @db || open
  end

  # Reads the gzipped dictionary source line by line. Lines starting with
  # '## <category>' select the code that is applied to the entries that
  # follow; other lines starting with '#' are comments. Every remaining
  # line is an entry, optionally followed by a decimal number which is
  # ignored.
  def populate
    require 'zlib'

    File.open(options[:source], 'r:UTF-8') do |f|
      mode = 0

      Zlib::GzipReader.new(f).each do |line|
        line.strip!

        if line.start_with?('#')
          case line
          when /^## male/i
            mode = Dictionary.code[:male]
          when /^## female/i
            mode = Dictionary.code[:female]
          when /^## (?:surname|last|chinese)/i
            mode = Dictionary.code[:surname]
          when /^## months/i
            mode = Dictionary.code[:month]
          when /^## place/i
            mode = Dictionary.code[:place]
          when /^## publisher/i
            mode = Dictionary.code[:publisher]
          when /^## journal/i
            mode = Dictionary.code[:journal]
          else
            # skip comments
          end
        else
          key, _probability = line.split(/\s+(\d+\.\d+)\s*$/)
          value = self[key]
          # Add the current category's code unless the stored value has
          # already reached it.
          self[key] = value + mode if value < mode
        end
      end
    end

  end

end
|
163
|
+
|
164
|
+
end
|
165
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Anystyle
  module Parser

    # Base class of the parser's errors. In addition to the message, an
    # error remembers another exception as its origin; unless one is
    # passed in explicitly, this is the exception currently being
    # handled (if any).
    class Error < StandardError

      # The exception recorded as the origin of this error.
      attr_accessor :original

      def initialize(message = nil, original = $!)
        super(message)
        self.original = original
      end

    end

    DictionaryError = Class.new(Error)
    TrainingError   = Class.new(Error)

  end
end
|
@@ -0,0 +1,164 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
module Anystyle
|
4
|
+
module Parser
|
5
|
+
|
6
|
+
# A Feature couples a name with a matcher (a callable). The class itself
# keeps the list of all defined features and a reference to the shared
# Dictionary instance.
class Feature

  @dict = Dictionary.instance
  @instances = []

  class << self
    attr_reader :dict, :instances
  end

  # Creates a feature called +name+ whose matcher is the given block and
  # appends it to the list of defined features.
  def self.define(name, &block)
    instances.push(new(name, block))
  end

  # Removes every defined feature called +name+.
  def self.undefine(name)
    instances.reject! { |feature| feature.name == name }
  end

  attr_accessor :name, :matcher

  def initialize(name, matcher)
    @name = name
    @matcher = matcher
  end

  # Invokes the matcher with the given arguments and returns its result.
  def match(*arguments)
    matcher.call(*arguments)
  end

end
|
36
|
+
|
37
|
+
|
38
|
+
# Classifies the last character of the token: :upper, :lower or :numeric
# for letters and digits; any other character is returned as it is.
Feature.define :last_character do |token, stripped, sequence, offset|
  final = token.split(//).last

  case final
  when /^[[:upper:]]$/ then :upper
  when /^[[:lower:]]$/ then :lower
  when /^\d$/ then :numeric
  else final
  end
end
|
52
|
+
|
53
|
+
# The token's leading one, two, three and four characters. Always yields
# four strings; for shorter tokens the longest available prefix repeats.
Feature.define :first do |token, stripped, sequence, offset|
  head = token.split(//).first(4)
  1.upto(4).map { |length| head.first(length).join }
end
|
58
|
+
|
59
|
+
# The token's trailing one, two, three and four characters. Always yields
# four strings; for shorter tokens the longest available suffix repeats.
Feature.define :last do |token, stripped, sequence, offset|
  tail = token.split(//).last(4)
  1.upto(4).map { |length| tail.last(length).join }
end
|
64
|
+
|
65
|
+
# The stripped token in lower case, or :EMPTY if nothing is left of the
# token after stripping.
Feature.define :stripped_lowercase do |token, stripped, sequence, offset|
  if stripped.empty?
    :EMPTY
  else
    stripped.downcase
  end
end
|
68
|
+
|
69
|
+
# Describes the capitalization of the stripped token: a single capital
# letter (:single), a capital followed by a lower case letter (:initial),
# capitals only (:all), or anything else (:other).
Feature.define :capitalization do |token, stripped, sequence, offset|
  case stripped
  when /^[[:upper:]]$/ then :single
  when /^[[:upper:]][[:lower:]]/ then :initial
  when /^[[:upper:]]+$/ then :all
  else :other
  end
end
|
81
|
+
|
82
|
+
# Classifies the numeric content of the token. The patterns are tried in
# the order given and the first one that matches decides the result;
# tokens without any digit (and not matching the page pattern) yield
# :none.
Feature.define :numbers do |token, stripped, sequence, offset|
  case token
  when /\d\(\d+(-\d+)?\)/ then :volume
  when /^\(\d{4}\)\W*$/, /^(1\d{3}|20\d{2})[\.,;:]?$/ then :year
  when /\d+\s*--?\s*\d+/, /^\W*pp?\.\d*\W*$/ then :page
  when /^\d$/ then :single
  when /^\d{2}$/ then :double
  when /^\d{3}$/ then :triple
  when /^\d+$/ then :digits
  when /\d+(th|st|nd|rd)\W*/i then :ordinal
  when /\d/ then :numeric
  else :none
  end
end
|
106
|
+
|
107
|
+
# Looks up the lower-cased stripped token in the dictionary. The result
# starts with the token's dictionary code, followed by one entry per
# dictionary key: the key itself if its bit is set in the code, otherwise
# the key prefixed with 'no-'.
Feature.define :dictionary do |token, stripped, sequence, offset|
  code = Feature.dict[stripped.downcase]

  flags = Dictionary.keys.map do |key|
    if (code & Dictionary.code[key]) > 0
      key
    else
      :"no-#{key}"
    end
  end

  [code] + flags
end
|
114
|
+
|
115
|
+
# TODO sequence features should be called just once per sequence
# TODO improve / disambiguate edition
#
# Reports whether any token of the sequence is an editor marker (ed,
# eds, editor, editors or edited; case is ignored).
Feature.define :editors do |token, stripped, sequence, offset|
  mentioned = sequence.any? { |word| word =~ /^(ed|editor|editors|eds|edited)$/i }
  mentioned ? :editors : :'no-editors'
end
|
120
|
+
|
121
|
+
# TODO Translated
|
122
|
+
|
123
|
+
# The token's relative position within the sequence: the offset divided
# by the sequence length, scaled by ten and rounded to an integer.
Feature.define :location do |token, stripped, sequence, offset|
  relative = offset.to_f / sequence.length
  (relative * 10).round
end
|
126
|
+
|
127
|
+
# Classifies the punctuation found in the token. The patterns are tried
# in the order given and the first one that matches decides the result;
# tokens matching none of them yield :others.
Feature.define :punctuation do |token, stripped, sequence, offset|
  case token
  when /^["'”’´‘“`]/ then :quote
  when /["'”’´‘“`]$/ then :unquote
  when /-+/ then :hyphen
  when /[,;:-]$/ then :internal
  when /[!\?\."']$/ then :terminal
  when /^[\(\[\{<].*[>\}\]\)].?$/ then :braces
  when /^\d{2,5}\(\d{2,5}\).?$/ then :volume
  else :others
  end
end
|
147
|
+
|
148
|
+
|
149
|
+
# Guesses the kind of reference from the sequence as a whole and from the
# current token's neighbours.
Feature.define :type do |token, stripped, sequence, offset|
  text = sequence.join(' ')

  if text =~ /dissertation abstract/i
    # NOTE(review): the spelling is reproduced as is; it looks like a typo
    # for :dissertation, but presumably existing models were trained with
    # this value -- confirm before renaming.
    :dissertaion
  elsif text =~ /proceeding/i
    :proceedings
  # An 'in' that is followed by a capitalized token and preceded by a
  # token ending in a quote or punctuation mark.
  # NOTE(review): assuming sequence is an Array, offset 0 makes
  # sequence[offset-1] refer to the last token -- confirm this is intended.
  elsif stripped =~ /^in$/i && sequence[offset+1].to_s =~ /^[[:upper:]]/ && sequence[offset-1].to_s =~ /["'”’´‘“`\.;,]$/
    :collection
  else
    :other
  end
end
|
162
|
+
|
163
|
+
end
|
164
|
+
end
|