anystyle-parser 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +28 -12
- data/HISTORY.md +6 -0
- data/LICENSE +2 -2
- data/README.md +11 -11
- data/Rakefile +14 -3
- data/anystyle-parser.gemspec +13 -8
- data/features/support/env.rb +18 -0
- data/lib/anystyle/parser/dictionary.rb +35 -37
- data/lib/anystyle/parser/errors.rb +18 -18
- data/lib/anystyle/parser/parser.rb +254 -244
- data/lib/anystyle/parser/utility.rb +18 -18
- data/lib/anystyle/parser/version.rb +1 -1
- data/spec/anystyle/parser/parser_spec.rb +119 -115
- data/spec/spec_helper.rb +9 -2
- metadata +26 -43
- data/.autotest +0 -0
- data/.gitignore +0 -5
- data/.rspec +0 -3
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 9723d7ea8fd46588c2487a72357c29fc9b1811ce
|
4
|
+
data.tar.gz: bfe339c9e7ab8883cbef7d8ad4de1a6aba433b53
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 18591aa6d5ab49057b57308ff4a38b0e99aa07b45acf244998b47e51653b892ca442a5def2f3ed547f74c4cf338355070749cd31ca44dec08d41d21b2f23912b
|
7
|
+
data.tar.gz: 2790d3bd5f4fa9a86aae3be03b3dbb9847816911a173186c898def47f28c243f3bf12ccdedb34e914bd5549f3bdaeed24fcc47c014b628f71b96dacc0d39846e
|
data/Gemfile
CHANGED
@@ -1,19 +1,19 @@
|
|
1
1
|
source 'https://rubygems.org'
|
2
2
|
gemspec
|
3
3
|
|
4
|
-
group :development do
|
5
|
-
|
6
|
-
|
7
|
-
|
4
|
+
group :development, :test do
|
5
|
+
gem 'rake'
|
6
|
+
gem 'cucumber'
|
7
|
+
gem 'rspec'
|
8
|
+
gem 'simplecov', '~>0.8', :require => false
|
9
|
+
gem 'rubinius-coverage', :platform => :rbx
|
10
|
+
gem 'coveralls', :require => false
|
8
11
|
end
|
9
12
|
|
10
|
-
group :
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
gem 'cucumber'
|
15
|
-
gem 'rspec'
|
16
|
-
gem 'ZenTest'
|
13
|
+
group :debug do
|
14
|
+
gem 'debugger', '~>1.6', :require => false, :platform => :mri
|
15
|
+
gem 'rubinius-compiler', '~>2.0', :require => false, :platform => :rbx
|
16
|
+
gem 'rubinius-debugger', '~>2.0', :require => false, :platform => :rbx
|
17
17
|
end
|
18
18
|
|
19
19
|
group :profile do
|
@@ -22,6 +22,22 @@ group :profile do
|
|
22
22
|
end
|
23
23
|
|
24
24
|
group :extra do
|
25
|
+
gem 'autotest-fsevent', :require => false
|
26
|
+
gem 'yard'
|
27
|
+
gem 'ZenTest'
|
28
|
+
end
|
29
|
+
|
30
|
+
group :redis do
|
31
|
+
gem 'redis'
|
32
|
+
gem 'hiredis'
|
33
|
+
end
|
34
|
+
|
35
|
+
group :kyoto do
|
25
36
|
gem 'kyotocabinet-ruby', :require => 'kyotocabinet'
|
26
|
-
|
37
|
+
end
|
38
|
+
|
39
|
+
platform :rbx do
|
40
|
+
gem 'rubysl', '~>2.0'
|
41
|
+
gem 'json', '~>1.8'
|
42
|
+
gem 'racc'
|
27
43
|
end
|
data/HISTORY.md
CHANGED
data/LICENSE
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
Copyright 2011-
|
1
|
+
Copyright 2011-2014 Sylvester Keil. All rights reserved.
|
2
2
|
|
3
3
|
Redistribution and use in source and binary forms, with or without
|
4
4
|
modification, are permitted provided that the following conditions are met:
|
@@ -23,4 +23,4 @@ EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
23
23
|
|
24
24
|
The views and conclusions contained in the software and documentation are
|
25
25
|
those of the authors and should not be interpreted as representing official
|
26
|
-
policies, either expressed or implied, of the copyright holder.
|
26
|
+
policies, either expressed or implied, of the copyright holder.
|
data/README.md
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
Anystyle-Parser
|
2
2
|
===============
|
3
|
+
[Build Status](https://travis-ci.org/inukshuk/anystyle-parser)
|
4
|
+
[Coverage Status](https://coveralls.io/r/inukshuk/anystyle-parser)
|
3
5
|
|
4
6
|
Anystyle-Parser is a very fast and smart parser for academic references. It
|
5
7
|
is inspired by [ParsCit](http://aye.comp.nus.edu.sg/parsCit/) and
|
@@ -18,7 +20,7 @@ Installation
|
|
18
20
|
$ [sudo] gem install anystyle-parser
|
19
21
|
|
20
22
|
During the statistical analysis of reference strings, Anystyle-Parser relies
|
21
|
-
on a large feature dictionary; by default, Anystyle-Parser creates a
|
23
|
+
on a large feature dictionary; by default, Anystyle-Parser creates a
|
22
24
|
[Kyoto Cabinet](http://fallabs.com/kyotocabinet/) file-based hash database
|
23
25
|
from the dictionary file that ships with the parser. If Kyoto Cabinet is
|
24
26
|
not installed on your system, Anystyle-Parser uses a simple Ruby Hash as a
|
@@ -26,7 +28,7 @@ fall-back; this Hash has to be re-created every time you load the parser
|
|
26
28
|
and takes up a lot of memory in your Ruby process; it is therefore strongly
|
27
29
|
recommended to install Kyoto Cabinet and the `kyotocabinet-ruby` gem.
|
28
30
|
|
29
|
-
$ [sudo] gem install kyotocabinet-ruby
|
31
|
+
$ [sudo] gem install kyotocabinet-ruby
|
30
32
|
|
31
33
|
The database file will be created the first time you access the dictionary;
|
32
34
|
note that you will need write permissions in the directory where the file
|
@@ -39,8 +41,8 @@ Starting with version 0.1.0, Anystyle-Parser also supports
|
|
39
41
|
[Redis](http://redis.io); to use Redis as the data store you need to install
|
40
42
|
the `redis` gem (and, optionally, the `hiredis` gem).
|
41
43
|
|
42
|
-
$ [sudo] gem install hiredis
|
43
|
-
$ [sudo] gem install redis
|
44
|
+
$ [sudo] gem install hiredis
|
45
|
+
$ [sudo] gem install redis
|
44
46
|
|
45
47
|
To see which data store modes are available in your current environment,
|
46
48
|
check the output of `Dictionary.modes`:
|
@@ -52,7 +54,7 @@ To select one of the available modes, use the dictionary instance options:
|
|
52
54
|
|
53
55
|
> Anystyle::Parser::Dictionary.instance.options[:mode]
|
54
56
|
=> :kyoto
|
55
|
-
|
57
|
+
|
56
58
|
To use [Redis](http://redis.io) you also need to set the host or unix socket
|
57
59
|
where your redis server is available. For example:
|
58
60
|
|
@@ -84,7 +86,7 @@ Anystyle-Parser are `#parse` and `#train` that both accept two arguments.
|
|
84
86
|
`#parse` parses the passed-in input (either a filename, your reference strings,
|
85
87
|
or an array of your reference strings) and returns the parsed data in the
|
86
88
|
format specified as the second argument (supported formats include: *:hash*,
|
87
|
-
*:bibtex*, *:citeproc*, and *:
|
89
|
+
*:bibtex*, *:citeproc*, *:tags*, and *:raw*).
|
88
90
|
|
89
91
|
`#train` allows you to easily train the Parser's CRF model. The first argument
|
90
92
|
is either a filename or your data as a string; the format of training data
|
@@ -113,7 +115,7 @@ The following irb sessions illustrates some parser goodness:
|
|
113
115
|
year = {1997}
|
114
116
|
}
|
115
117
|
=> nil
|
116
|
-
|
118
|
+
|
117
119
|
### Unhappy with the results?
|
118
120
|
|
119
121
|
Citation references come in many forms, so, inevitably, you will find data
|
@@ -159,11 +161,10 @@ data again:
|
|
159
161
|
=> [{:author=>"John Lafferty and Andrew McCallum and Fernando Pereira", :title=>"Conditional random fields: probabilistic models for segmenting and labeling sequence data", :booktitle=>"Proceedings of the International Conference on Machine Learning", :pages=>"282--289", :publisher=>"Morgan Kaufmann", :location=>"San Francisco, CA", :year=>2001, :type=>:inproceedings}]
|
160
162
|
|
161
163
|
If you want to make Anystyle-Parser smarter, please consider sending us your
|
162
|
-
tagged references (see below).
|
164
|
+
tagged references (see below).
|
163
165
|
|
164
166
|
Contributing
|
165
167
|
------------
|
166
|
-
|
167
168
|
The Anystyle-Parser source code is
|
168
169
|
[hosted on GitHub](http://github.com/inukshuk/anystyle-parser/).
|
169
170
|
You can check out a copy of the latest code using Git:
|
@@ -182,8 +183,7 @@ and open a pull request on GitHub.
|
|
182
183
|
|
183
184
|
License
|
184
185
|
-------
|
185
|
-
|
186
|
-
Copyright 2011-2012 Sylvester Keil. All rights reserved.
|
186
|
+
Copyright 2011-2014 Sylvester Keil. All rights reserved.
|
187
187
|
|
188
188
|
Some of the code in Anystyle-Parser's post processing (normalizing) routines
|
189
189
|
was originally based on the source code of FreeCite and
|
data/Rakefile
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'bundler'
|
2
2
|
begin
|
3
|
-
Bundler.setup
|
3
|
+
Bundler.setup
|
4
4
|
rescue Bundler::BundlerError => e
|
5
5
|
$stderr.puts e.message
|
6
6
|
$stderr.puts "Run `bundle install` to install missing gems"
|
@@ -23,6 +23,13 @@ task :release => [:build] do
|
|
23
23
|
system "gem push anystyle-parser-#{Anystyle::Parser::VERSION}.gem"
|
24
24
|
end
|
25
25
|
|
26
|
+
task :check_warnings do
|
27
|
+
$VERBOSE = true
|
28
|
+
require 'anystyle/parser'
|
29
|
+
|
30
|
+
puts Anystyle::Parser::VERSION
|
31
|
+
end
|
32
|
+
|
26
33
|
require 'rspec/core'
|
27
34
|
require 'rspec/core/rake_task'
|
28
35
|
RSpec::Core::RakeTask.new(:spec) do |spec|
|
@@ -32,6 +39,10 @@ end
|
|
32
39
|
require 'cucumber/rake/task'
|
33
40
|
Cucumber::Rake::Task.new(:features)
|
34
41
|
|
42
|
+
require 'coveralls/rake/task'
|
43
|
+
Coveralls::RakeTask.new
|
44
|
+
task :test_with_coveralls => [:spec, 'coveralls:push']
|
45
|
+
|
35
46
|
task :default => :spec
|
36
47
|
|
37
48
|
begin
|
@@ -41,13 +52,13 @@ rescue LoadError
|
|
41
52
|
# ignore
|
42
53
|
end
|
43
54
|
|
44
|
-
desc 'Run an IRB session with
|
55
|
+
desc 'Run an IRB session with Anystyle-Parser loaded'
|
45
56
|
task :console, [:script] do |t, args|
|
46
57
|
ARGV.clear
|
47
58
|
|
48
59
|
require 'irb'
|
49
60
|
require 'anystyle/parser'
|
50
|
-
|
61
|
+
|
51
62
|
IRB.conf[:SCRIPT] = args.script
|
52
63
|
IRB.start
|
53
64
|
end
|
data/anystyle-parser.gemspec
CHANGED
@@ -11,22 +11,27 @@ Gem::Specification.new do |s|
|
|
11
11
|
s.authors = ['Sylvester Keil']
|
12
12
|
s.email = ['http://sylvester.keil.or.at']
|
13
13
|
s.homepage = 'http://github.com/inukshuk/anystyle-parser'
|
14
|
-
s.summary = '
|
15
|
-
s.description = 'A sophisticated parser for academic
|
14
|
+
s.summary = 'Smart and fast academic bibliography parser.'
|
15
|
+
s.description = 'A sophisticated parser for academic reference lists and bibliographies based on machine learning algorithms using conditional random fields.'
|
16
16
|
s.license = 'FreeBSD'
|
17
|
-
|
18
|
-
s.
|
17
|
+
|
18
|
+
s.required_ruby_version = '>= 1.9.3'
|
19
|
+
|
20
|
+
s.add_runtime_dependency('bibtex-ruby', '~>3.0')
|
19
21
|
s.add_runtime_dependency('wapiti', '~>0.0')
|
20
|
-
s.add_runtime_dependency('namae', '~>0.
|
22
|
+
s.add_runtime_dependency('namae', '~>0.8')
|
23
|
+
|
24
|
+
s.files = `git ls-files`.split("\n").reject { |path|
|
25
|
+
path.start_with?('.')
|
26
|
+
} - Dir['resources/**/*']
|
21
27
|
|
22
|
-
s.files = `git ls-files`.split("\n") - Dir['resources/**/*']
|
23
28
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
24
29
|
s.executables = []
|
25
30
|
s.require_path = 'lib'
|
26
31
|
|
27
32
|
s.rdoc_options = %w{--line-numbers --inline-source --title "Anystyle\ Parser" --main README.md}
|
28
33
|
s.extra_rdoc_files = %w{README.md LICENSE}
|
29
|
-
|
34
|
+
|
30
35
|
end
|
31
36
|
|
32
|
-
# vim: syntax=ruby
|
37
|
+
# vim: syntax=ruby
|
data/features/support/env.rb
CHANGED
@@ -1 +1,19 @@
|
|
1
|
+
begin
|
2
|
+
require 'simplecov'
|
3
|
+
require 'coveralls' if ENV['CI']
|
4
|
+
rescue LoadError
|
5
|
+
# ignore
|
6
|
+
end
|
7
|
+
|
8
|
+
begin
|
9
|
+
case
|
10
|
+
when defined?(RUBY_ENGINE) && RUBY_ENGINE == 'rbx'
|
11
|
+
require 'rubinius/debugger'
|
12
|
+
else
|
13
|
+
require 'debugger'
|
14
|
+
end
|
15
|
+
rescue LoadError
|
16
|
+
# ignore
|
17
|
+
end
|
18
|
+
|
1
19
|
require 'anystyle/parser'
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Anystyle
|
2
2
|
module Parser
|
3
|
-
|
3
|
+
|
4
4
|
# Dictionary is a Singleton object that provides a key-value store of
|
5
5
|
# the Anystyle Parser dictionary required for feature elicitation.
|
6
6
|
# This dictionary acts essentially like a Ruby Hash object, but because
|
@@ -11,8 +11,8 @@ module Anystyle
|
|
11
11
|
#
|
12
12
|
# Starting with version 0.1.0 Redis support was added. If you would
|
13
13
|
# like to use Redis as the dictionary data store you can do so by
|
14
|
-
#
|
15
|
-
#
|
14
|
+
# installing the `redis' gem (and optionally the `hiredis' gem).
|
15
|
+
#
|
16
16
|
# The database will be automatically created from the dictionary file
|
17
17
|
# using the best available DBM the first time it is accessed. Once the
|
18
18
|
# database file exists, the database will be restored from file.
|
@@ -46,7 +46,7 @@ module Anystyle
|
|
46
46
|
class Dictionary
|
47
47
|
|
48
48
|
include Singleton
|
49
|
-
|
49
|
+
|
50
50
|
@keys = [:male, :female, :surname, :month, :place, :publisher, :journal].freeze
|
51
51
|
|
52
52
|
@code = Hash[*@keys.zip(0.upto(@keys.length-1).map { |i| 2**i }).flatten]
|
@@ -67,42 +67,40 @@ module Anystyle
|
|
67
67
|
rescue LoadError
|
68
68
|
# info 'no redis support detected'
|
69
69
|
end
|
70
|
-
|
70
|
+
|
71
71
|
begin
|
72
72
|
require 'kyotocabinet'
|
73
73
|
@modes.unshift :kyoto
|
74
74
|
rescue LoadError
|
75
75
|
# info 'no kyoto-cabinet support detected'
|
76
76
|
end
|
77
|
-
|
77
|
+
|
78
78
|
@defaults = {
|
79
79
|
:mode => @modes[0],
|
80
80
|
:source => File.expand_path('../support/dict.txt.gz', __FILE__),
|
81
81
|
:cabinet => File.expand_path('../support/dict.kch', __FILE__),
|
82
82
|
:port => 6379
|
83
83
|
}.freeze
|
84
|
-
|
85
|
-
|
84
|
+
|
85
|
+
|
86
86
|
class << self
|
87
|
-
|
88
87
|
attr_reader :keys, :code, :defaults, :modes
|
89
|
-
|
90
88
|
end
|
91
89
|
|
92
90
|
attr_reader :options
|
93
|
-
|
91
|
+
|
94
92
|
def initialize
|
95
93
|
@options = Dictionary.defaults.dup
|
96
94
|
end
|
97
|
-
|
95
|
+
|
98
96
|
def [](key)
|
99
97
|
db[key.to_s].to_i
|
100
98
|
end
|
101
|
-
|
99
|
+
|
102
100
|
def []=(key, value)
|
103
101
|
db[key.to_s] = value
|
104
102
|
end
|
105
|
-
|
103
|
+
|
106
104
|
def create
|
107
105
|
case options[:mode]
|
108
106
|
when :kyoto
|
@@ -113,61 +111,61 @@ module Anystyle
|
|
113
111
|
end
|
114
112
|
populate
|
115
113
|
close
|
116
|
-
|
114
|
+
|
117
115
|
when :redis
|
118
116
|
@db ||= Redis.new(options)
|
119
117
|
populate
|
120
118
|
close
|
121
|
-
|
119
|
+
|
122
120
|
else
|
123
121
|
# nothing
|
124
122
|
end
|
125
123
|
end
|
126
|
-
|
124
|
+
|
127
125
|
def truncate
|
128
126
|
close
|
129
|
-
File.unlink(path) if File.exists?(path)
|
127
|
+
File.unlink(path) if File.exists?(path)
|
130
128
|
end
|
131
|
-
|
129
|
+
|
132
130
|
def open
|
133
131
|
case options[:mode]
|
134
132
|
when :kyoto
|
135
133
|
at_exit { ::Anystyle::Parser::Dictionary.instance.close }
|
136
134
|
|
137
135
|
create unless File.exists?(path)
|
138
|
-
|
136
|
+
|
139
137
|
@db = KyotoCabinet::DB.new
|
140
138
|
unless @db.open(path, KyotoCabinet::DB::OREADER)
|
141
139
|
raise DictionaryError, "failed to open cabinet file #{path}: #{@db.error}"
|
142
140
|
end
|
143
|
-
|
141
|
+
|
144
142
|
when :redis
|
145
143
|
at_exit { ::Anystyle::Parser::Dictionary.instance.close }
|
146
144
|
@db = Redis.new(options)
|
147
|
-
|
145
|
+
|
148
146
|
populate if @db.dbsize.zero?
|
149
|
-
|
147
|
+
|
150
148
|
else
|
151
149
|
@db = Hash.new(0)
|
152
150
|
populate
|
153
151
|
end
|
154
|
-
|
152
|
+
|
155
153
|
@db
|
156
154
|
end
|
157
|
-
|
155
|
+
|
158
156
|
def open?; !!@db; end
|
159
|
-
|
157
|
+
|
160
158
|
def close
|
161
159
|
case
|
162
160
|
when @db.respond_to?(:close)
|
163
|
-
@db.close
|
161
|
+
@db.close
|
164
162
|
when @db.respond_to?(:quit)
|
165
163
|
@db.quit
|
166
164
|
end
|
167
|
-
|
165
|
+
|
168
166
|
@db = nil
|
169
167
|
end
|
170
|
-
|
168
|
+
|
171
169
|
def path
|
172
170
|
case options[:mode]
|
173
171
|
when :kyoto
|
@@ -178,13 +176,13 @@ module Anystyle
|
|
178
176
|
'hash'
|
179
177
|
end
|
180
178
|
end
|
181
|
-
|
179
|
+
|
182
180
|
private
|
183
|
-
|
181
|
+
|
184
182
|
def db
|
185
183
|
@db || open
|
186
184
|
end
|
187
|
-
|
185
|
+
|
188
186
|
def populate
|
189
187
|
require 'zlib'
|
190
188
|
|
@@ -193,7 +191,7 @@ module Anystyle
|
|
193
191
|
|
194
192
|
Zlib::GzipReader.new(f).each do |line|
|
195
193
|
line.strip!
|
196
|
-
|
194
|
+
|
197
195
|
if line.start_with?('#')
|
198
196
|
case line
|
199
197
|
when /^## male/i
|
@@ -214,7 +212,7 @@ module Anystyle
|
|
214
212
|
# skip comments
|
215
213
|
end
|
216
214
|
else
|
217
|
-
key
|
215
|
+
key = line.split(/\s+(\d+\.\d+)\s*$/)[0]
|
218
216
|
value = self[key]
|
219
217
|
self[key] = value + mode if value < mode
|
220
218
|
end
|
@@ -222,8 +220,8 @@ module Anystyle
|
|
222
220
|
end
|
223
221
|
|
224
222
|
end
|
225
|
-
|
223
|
+
|
226
224
|
end
|
227
|
-
|
225
|
+
|
228
226
|
end
|
229
|
-
end
|
227
|
+
end
|