anystyle-parser 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +28 -12
- data/HISTORY.md +6 -0
- data/LICENSE +2 -2
- data/README.md +11 -11
- data/Rakefile +14 -3
- data/anystyle-parser.gemspec +13 -8
- data/features/support/env.rb +18 -0
- data/lib/anystyle/parser/dictionary.rb +35 -37
- data/lib/anystyle/parser/errors.rb +18 -18
- data/lib/anystyle/parser/parser.rb +254 -244
- data/lib/anystyle/parser/utility.rb +18 -18
- data/lib/anystyle/parser/version.rb +1 -1
- data/spec/anystyle/parser/parser_spec.rb +119 -115
- data/spec/spec_helper.rb +9 -2
- metadata +26 -43
- data/.autotest +0 -0
- data/.gitignore +0 -5
- data/.rspec +0 -3
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 9723d7ea8fd46588c2487a72357c29fc9b1811ce
|
4
|
+
data.tar.gz: bfe339c9e7ab8883cbef7d8ad4de1a6aba433b53
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 18591aa6d5ab49057b57308ff4a38b0e99aa07b45acf244998b47e51653b892ca442a5def2f3ed547f74c4cf338355070749cd31ca44dec08d41d21b2f23912b
|
7
|
+
data.tar.gz: 2790d3bd5f4fa9a86aae3be03b3dbb9847816911a173186c898def47f28c243f3bf12ccdedb34e914bd5549f3bdaeed24fcc47c014b628f71b96dacc0d39846e
|
data/Gemfile
CHANGED
@@ -1,19 +1,19 @@
|
|
1
1
|
source 'https://rubygems.org'
|
2
2
|
gemspec
|
3
3
|
|
4
|
-
group :development do
|
5
|
-
|
6
|
-
|
7
|
-
|
4
|
+
group :development, :test do
|
5
|
+
gem 'rake'
|
6
|
+
gem 'cucumber'
|
7
|
+
gem 'rspec'
|
8
|
+
gem 'simplecov', '~>0.8', :require => false
|
9
|
+
gem 'rubinius-coverage', :platform => :rbx
|
10
|
+
gem 'coveralls', :require => false
|
8
11
|
end
|
9
12
|
|
10
|
-
group :
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
gem 'cucumber'
|
15
|
-
gem 'rspec'
|
16
|
-
gem 'ZenTest'
|
13
|
+
group :debug do
|
14
|
+
gem 'debugger', '~>1.6', :require => false, :platform => :mri
|
15
|
+
gem 'rubinius-compiler', '~>2.0', :require => false, :platform => :rbx
|
16
|
+
gem 'rubinius-debugger', '~>2.0', :require => false, :platform => :rbx
|
17
17
|
end
|
18
18
|
|
19
19
|
group :profile do
|
@@ -22,6 +22,22 @@ group :profile do
|
|
22
22
|
end
|
23
23
|
|
24
24
|
group :extra do
|
25
|
+
gem 'autotest-fsevent', :require => false
|
26
|
+
gem 'yard'
|
27
|
+
gem 'ZenTest'
|
28
|
+
end
|
29
|
+
|
30
|
+
group :redis do
|
31
|
+
gem 'redis'
|
32
|
+
gem 'hiredis'
|
33
|
+
end
|
34
|
+
|
35
|
+
group :kyoto do
|
25
36
|
gem 'kyotocabinet-ruby', :require => 'kyotocabinet'
|
26
|
-
|
37
|
+
end
|
38
|
+
|
39
|
+
platform :rbx do
|
40
|
+
gem 'rubysl', '~>2.0'
|
41
|
+
gem 'json', '~>1.8'
|
42
|
+
gem 'racc'
|
27
43
|
end
|
data/HISTORY.md
CHANGED
data/LICENSE
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
Copyright 2011-
|
1
|
+
Copyright 2011-2014 Sylvester Keil. All rights reserved.
|
2
2
|
|
3
3
|
Redistribution and use in source and binary forms, with or without
|
4
4
|
modification, are permitted provided that the following conditions are met:
|
@@ -23,4 +23,4 @@ EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
23
23
|
|
24
24
|
The views and conclusions contained in the software and documentation are
|
25
25
|
those of the authors and should not be interpreted as representing official
|
26
|
-
policies, either expressed or implied, of the copyright holder.
|
26
|
+
policies, either expressed or implied, of the copyright holder.
|
data/README.md
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
Anystyle-Parser
|
2
2
|
===============
|
3
|
+
[![Build Status](https://travis-ci.org/inukshuk/anystyle-parser.png?branch=master)](https://travis-ci.org/inukshuk/anystyle-parser)
|
4
|
+
[![Coverage Status](https://coveralls.io/repos/inukshuk/anystyle-parser/badge.png)](https://coveralls.io/r/inukshuk/anystyle-parser)
|
3
5
|
|
4
6
|
Anystyle-Parser is a very fast and smart parser for academic references. It
|
5
7
|
is inspired by [ParsCit](http://aye.comp.nus.edu.sg/parsCit/) and
|
@@ -18,7 +20,7 @@ Installation
|
|
18
20
|
$ [sudo] gem install anystyle-parser
|
19
21
|
|
20
22
|
During the statistical analysis of reference strings, Anystyle-Parser relies
|
21
|
-
on a large feature dictionary; by default, Anystyle-Parser creates a
|
23
|
+
on a large feature dictionary; by default, Anystyle-Parser creates a
|
22
24
|
[Kyoto Cabinet](http://fallabs.com/kyotocabinet/) file-based hash database
|
23
25
|
from the dictionary file that ships with the parser. If Kyoto Cabinet is
|
24
26
|
not installed on your system, Anystyle-Parser uses a simple Ruby Hash as a
|
@@ -26,7 +28,7 @@ fall-back; this Hash has to be re-created every time you load the parser
|
|
26
28
|
and takes up a lot of memory in your Ruby process; it is therefore strongly
|
27
29
|
recommended to install Kyoto Cabinet and the `kyotocabinet-ruby` gem.
|
28
30
|
|
29
|
-
$ [sudo] gem install kyotocabinet-ruby
|
31
|
+
$ [sudo] gem install kyotocabinet-ruby
|
30
32
|
|
31
33
|
The database file will be created the first time you access the dictionary;
|
32
34
|
note that you will need write permissions in the directory where the file
|
@@ -39,8 +41,8 @@ Starting with version 0.1.0, Anystyle-Parser also supports
|
|
39
41
|
[Redis](http://redis.io); to use Redis as the data store you need to install
|
40
42
|
the `redis` gem (and, optionally, the `hiredis` gem).
|
41
43
|
|
42
|
-
$ [sudo] gem install hiredis
|
43
|
-
$ [sudo] gem install redis
|
44
|
+
$ [sudo] gem install hiredis
|
45
|
+
$ [sudo] gem install redis
|
44
46
|
|
45
47
|
To see which data store modes are available in your current environment,
|
46
48
|
check the output of `Dictionary.modes`:
|
@@ -52,7 +54,7 @@ To select one of the available modes, use the dictionary instance options:
|
|
52
54
|
|
53
55
|
> Anystyle::Parser::Dictionary.instance.options[:mode]
|
54
56
|
=> :kyoto
|
55
|
-
|
57
|
+
|
56
58
|
To use [Redis](http://redis.io) you also need to set the host or unix socket
|
57
59
|
where your redis server is available. For example:
|
58
60
|
|
@@ -84,7 +86,7 @@ Anystyle-Parser are `#parse` and `#train` that both accept two arguments.
|
|
84
86
|
`#parse` parses the passed-in input (either a filename, your reference strings,
|
85
87
|
or an array of your reference strings) and returns the parsed data in the
|
86
88
|
format specified as the second argument (supported formats include: *:hash*,
|
87
|
-
*:bibtex*, *:citeproc*, and *:
|
89
|
+
*:bibtex*, *:citeproc*, *:tags*, and *:raw*).
|
88
90
|
|
89
91
|
`#train` allows you to easily train the Parser's CRF model. The first argument
|
90
92
|
is either a filename or your data as a string; the format of training data
|
@@ -113,7 +115,7 @@ The following irb sessions illustrates some parser goodness:
|
|
113
115
|
year = {1997}
|
114
116
|
}
|
115
117
|
=> nil
|
116
|
-
|
118
|
+
|
117
119
|
### Unhappy with the results?
|
118
120
|
|
119
121
|
Citation references come in many forms, so, inevitably, you will find data
|
@@ -159,11 +161,10 @@ data again:
|
|
159
161
|
=> [{:author=>"John Lafferty and Andrew McCallum and Fernando Pereira", :title=>"Conditional random fields: probabilistic models for segmenting and labeling sequence data", :booktitle=>"Proceedings of the International Conference on Machine Learning", :pages=>"282--289", :publisher=>"Morgan Kaufmann", :location=>"San Francisco, CA", :year=>2001, :type=>:inproceedings}]
|
160
162
|
|
161
163
|
If you want to make Anystyle-Parser smarter, please consider sending us your
|
162
|
-
tagged references (see below).
|
164
|
+
tagged references (see below).
|
163
165
|
|
164
166
|
Contributing
|
165
167
|
------------
|
166
|
-
|
167
168
|
The Anystyle-Parser source code is
|
168
169
|
[hosted on GitHub](http://github.com/inukshuk/anystyle-parser/).
|
169
170
|
You can check out a copy of the latest code using Git:
|
@@ -182,8 +183,7 @@ and open a pull request on GitHub.
|
|
182
183
|
|
183
184
|
License
|
184
185
|
-------
|
185
|
-
|
186
|
-
Copyright 2011-2012 Sylvester Keil. All rights reserved.
|
186
|
+
Copyright 2011-2014 Sylvester Keil. All rights reserved.
|
187
187
|
|
188
188
|
Some of the code in Anystyle-Parser's post processing (normalizing) routines
|
189
189
|
was originally based on the source code of FreeCite and
|
data/Rakefile
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'bundler'
|
2
2
|
begin
|
3
|
-
Bundler.setup
|
3
|
+
Bundler.setup
|
4
4
|
rescue Bundler::BundlerError => e
|
5
5
|
$stderr.puts e.message
|
6
6
|
$stderr.puts "Run `bundle install` to install missing gems"
|
@@ -23,6 +23,13 @@ task :release => [:build] do
|
|
23
23
|
system "gem push anystyle-parser-#{Anystyle::Parser::VERSION}.gem"
|
24
24
|
end
|
25
25
|
|
26
|
+
task :check_warnings do
|
27
|
+
$VERBOSE = true
|
28
|
+
require 'anystyle/parser'
|
29
|
+
|
30
|
+
puts Anystyle::Parser::VERSION
|
31
|
+
end
|
32
|
+
|
26
33
|
require 'rspec/core'
|
27
34
|
require 'rspec/core/rake_task'
|
28
35
|
RSpec::Core::RakeTask.new(:spec) do |spec|
|
@@ -32,6 +39,10 @@ end
|
|
32
39
|
require 'cucumber/rake/task'
|
33
40
|
Cucumber::Rake::Task.new(:features)
|
34
41
|
|
42
|
+
require 'coveralls/rake/task'
|
43
|
+
Coveralls::RakeTask.new
|
44
|
+
task :test_with_coveralls => [:spec, 'coveralls:push']
|
45
|
+
|
35
46
|
task :default => :spec
|
36
47
|
|
37
48
|
begin
|
@@ -41,13 +52,13 @@ rescue LoadError
|
|
41
52
|
# ignore
|
42
53
|
end
|
43
54
|
|
44
|
-
desc 'Run an IRB session with
|
55
|
+
desc 'Run an IRB session with Anystyle-Parser loaded'
|
45
56
|
task :console, [:script] do |t, args|
|
46
57
|
ARGV.clear
|
47
58
|
|
48
59
|
require 'irb'
|
49
60
|
require 'anystyle/parser'
|
50
|
-
|
61
|
+
|
51
62
|
IRB.conf[:SCRIPT] = args.script
|
52
63
|
IRB.start
|
53
64
|
end
|
data/anystyle-parser.gemspec
CHANGED
@@ -11,22 +11,27 @@ Gem::Specification.new do |s|
|
|
11
11
|
s.authors = ['Sylvester Keil']
|
12
12
|
s.email = ['http://sylvester.keil.or.at']
|
13
13
|
s.homepage = 'http://github.com/inukshuk/anystyle-parser'
|
14
|
-
s.summary = '
|
15
|
-
s.description = 'A sophisticated parser for academic
|
14
|
+
s.summary = 'Smart and fast academic bibliography parser.'
|
15
|
+
s.description = 'A sophisticated parser for academic reference lists and bibliographies based on machine learning algorithms using conditional random fields.'
|
16
16
|
s.license = 'FreeBSD'
|
17
|
-
|
18
|
-
s.
|
17
|
+
|
18
|
+
s.required_ruby_version = '>= 1.9.3'
|
19
|
+
|
20
|
+
s.add_runtime_dependency('bibtex-ruby', '~>3.0')
|
19
21
|
s.add_runtime_dependency('wapiti', '~>0.0')
|
20
|
-
s.add_runtime_dependency('namae', '~>0.
|
22
|
+
s.add_runtime_dependency('namae', '~>0.8')
|
23
|
+
|
24
|
+
s.files = `git ls-files`.split("\n").reject { |path|
|
25
|
+
path.start_with?('.')
|
26
|
+
} - Dir['resources/**/*']
|
21
27
|
|
22
|
-
s.files = `git ls-files`.split("\n") - Dir['resources/**/*']
|
23
28
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
24
29
|
s.executables = []
|
25
30
|
s.require_path = 'lib'
|
26
31
|
|
27
32
|
s.rdoc_options = %w{--line-numbers --inline-source --title "Anystyle\ Parser" --main README.md}
|
28
33
|
s.extra_rdoc_files = %w{README.md LICENSE}
|
29
|
-
|
34
|
+
|
30
35
|
end
|
31
36
|
|
32
|
-
# vim: syntax=ruby
|
37
|
+
# vim: syntax=ruby
|
data/features/support/env.rb
CHANGED
@@ -1 +1,19 @@
|
|
1
|
+
begin
|
2
|
+
require 'simplecov'
|
3
|
+
require 'coveralls' if ENV['CI']
|
4
|
+
rescue LoadError
|
5
|
+
# ignore
|
6
|
+
end
|
7
|
+
|
8
|
+
begin
|
9
|
+
case
|
10
|
+
when defined?(RUBY_ENGINE) && RUBY_ENGINE == 'rbx'
|
11
|
+
require 'rubinius/debugger'
|
12
|
+
else
|
13
|
+
require 'debugger'
|
14
|
+
end
|
15
|
+
rescue LoadError
|
16
|
+
# ignore
|
17
|
+
end
|
18
|
+
|
1
19
|
require 'anystyle/parser'
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Anystyle
|
2
2
|
module Parser
|
3
|
-
|
3
|
+
|
4
4
|
# Dictionary is a Singleton object that provides a key-value store of
|
5
5
|
# the Anystyle Parser dictionary required for feature elicitation.
|
6
6
|
# This dictionary acts essentially like a Ruby Hash object, but because
|
@@ -11,8 +11,8 @@ module Anystyle
|
|
11
11
|
#
|
12
12
|
# Starting with version 0.1.0 Redis support was added. If you would
|
13
13
|
# like to use Redis as the dictionary data store you can do so by
|
14
|
-
#
|
15
|
-
#
|
14
|
+
# installing the `redis' gem (and optionally the `hiredis' gem).
|
15
|
+
#
|
16
16
|
# The database will be automatically created from the dictionary file
|
17
17
|
# using the best available DBM the first time it is accessed. Once
|
18
18
|
# the database file exists, the database will be restored from file.
|
@@ -46,7 +46,7 @@ module Anystyle
|
|
46
46
|
class Dictionary
|
47
47
|
|
48
48
|
include Singleton
|
49
|
-
|
49
|
+
|
50
50
|
@keys = [:male, :female, :surname, :month, :place, :publisher, :journal].freeze
|
51
51
|
|
52
52
|
@code = Hash[*@keys.zip(0.upto(@keys.length-1).map { |i| 2**i }).flatten]
|
@@ -67,42 +67,40 @@ module Anystyle
|
|
67
67
|
rescue LoadError
|
68
68
|
# info 'no redis support detected'
|
69
69
|
end
|
70
|
-
|
70
|
+
|
71
71
|
begin
|
72
72
|
require 'kyotocabinet'
|
73
73
|
@modes.unshift :kyoto
|
74
74
|
rescue LoadError
|
75
75
|
# info 'no kyoto-cabinet support detected'
|
76
76
|
end
|
77
|
-
|
77
|
+
|
78
78
|
@defaults = {
|
79
79
|
:mode => @modes[0],
|
80
80
|
:source => File.expand_path('../support/dict.txt.gz', __FILE__),
|
81
81
|
:cabinet => File.expand_path('../support/dict.kch', __FILE__),
|
82
82
|
:port => 6379
|
83
83
|
}.freeze
|
84
|
-
|
85
|
-
|
84
|
+
|
85
|
+
|
86
86
|
class << self
|
87
|
-
|
88
87
|
attr_reader :keys, :code, :defaults, :modes
|
89
|
-
|
90
88
|
end
|
91
89
|
|
92
90
|
attr_reader :options
|
93
|
-
|
91
|
+
|
94
92
|
def initialize
|
95
93
|
@options = Dictionary.defaults.dup
|
96
94
|
end
|
97
|
-
|
95
|
+
|
98
96
|
def [](key)
|
99
97
|
db[key.to_s].to_i
|
100
98
|
end
|
101
|
-
|
99
|
+
|
102
100
|
def []=(key, value)
|
103
101
|
db[key.to_s] = value
|
104
102
|
end
|
105
|
-
|
103
|
+
|
106
104
|
def create
|
107
105
|
case options[:mode]
|
108
106
|
when :kyoto
|
@@ -113,61 +111,61 @@ module Anystyle
|
|
113
111
|
end
|
114
112
|
populate
|
115
113
|
close
|
116
|
-
|
114
|
+
|
117
115
|
when :redis
|
118
116
|
@db ||= Redis.new(options)
|
119
117
|
populate
|
120
118
|
close
|
121
|
-
|
119
|
+
|
122
120
|
else
|
123
121
|
# nothing
|
124
122
|
end
|
125
123
|
end
|
126
|
-
|
124
|
+
|
127
125
|
def truncate
|
128
126
|
close
|
129
|
-
File.unlink(path) if File.exists?(path)
|
127
|
+
File.unlink(path) if File.exists?(path)
|
130
128
|
end
|
131
|
-
|
129
|
+
|
132
130
|
def open
|
133
131
|
case options[:mode]
|
134
132
|
when :kyoto
|
135
133
|
at_exit { ::Anystyle::Parser::Dictionary.instance.close }
|
136
134
|
|
137
135
|
create unless File.exists?(path)
|
138
|
-
|
136
|
+
|
139
137
|
@db = KyotoCabinet::DB.new
|
140
138
|
unless @db.open(path, KyotoCabinet::DB::OREADER)
|
141
139
|
raise DictionaryError, "failed to open cabinet file #{path}: #{@db.error}"
|
142
140
|
end
|
143
|
-
|
141
|
+
|
144
142
|
when :redis
|
145
143
|
at_exit { ::Anystyle::Parser::Dictionary.instance.close }
|
146
144
|
@db = Redis.new(options)
|
147
|
-
|
145
|
+
|
148
146
|
populate if @db.dbsize.zero?
|
149
|
-
|
147
|
+
|
150
148
|
else
|
151
149
|
@db = Hash.new(0)
|
152
150
|
populate
|
153
151
|
end
|
154
|
-
|
152
|
+
|
155
153
|
@db
|
156
154
|
end
|
157
|
-
|
155
|
+
|
158
156
|
def open?; !!@db; end
|
159
|
-
|
157
|
+
|
160
158
|
def close
|
161
159
|
case
|
162
160
|
when @db.respond_to?(:close)
|
163
|
-
@db.close
|
161
|
+
@db.close
|
164
162
|
when @db.respond_to?(:quit)
|
165
163
|
@db.quit
|
166
164
|
end
|
167
|
-
|
165
|
+
|
168
166
|
@db = nil
|
169
167
|
end
|
170
|
-
|
168
|
+
|
171
169
|
def path
|
172
170
|
case options[:mode]
|
173
171
|
when :kyoto
|
@@ -178,13 +176,13 @@ module Anystyle
|
|
178
176
|
'hash'
|
179
177
|
end
|
180
178
|
end
|
181
|
-
|
179
|
+
|
182
180
|
private
|
183
|
-
|
181
|
+
|
184
182
|
def db
|
185
183
|
@db || open
|
186
184
|
end
|
187
|
-
|
185
|
+
|
188
186
|
def populate
|
189
187
|
require 'zlib'
|
190
188
|
|
@@ -193,7 +191,7 @@ module Anystyle
|
|
193
191
|
|
194
192
|
Zlib::GzipReader.new(f).each do |line|
|
195
193
|
line.strip!
|
196
|
-
|
194
|
+
|
197
195
|
if line.start_with?('#')
|
198
196
|
case line
|
199
197
|
when /^## male/i
|
@@ -214,7 +212,7 @@ module Anystyle
|
|
214
212
|
# skip comments
|
215
213
|
end
|
216
214
|
else
|
217
|
-
key
|
215
|
+
key = line.split(/\s+(\d+\.\d+)\s*$/)[0]
|
218
216
|
value = self[key]
|
219
217
|
self[key] = value + mode if value < mode
|
220
218
|
end
|
@@ -222,8 +220,8 @@ module Anystyle
|
|
222
220
|
end
|
223
221
|
|
224
222
|
end
|
225
|
-
|
223
|
+
|
226
224
|
end
|
227
|
-
|
225
|
+
|
228
226
|
end
|
229
|
-
end
|
227
|
+
end
|