anystyle-parser 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +13 -7
- data/HISTORY.md +4 -0
- data/README.md +1 -1
- data/Rakefile +41 -3
- data/anystyle-parser.gemspec +1 -6
- data/cucumber.yml +1 -0
- data/lib/anystyle/parser/normalizer.rb +4 -36
- data/lib/anystyle/parser/version.rb +1 -1
- data/lib/anystyle/parser.rb +1 -3
- data/spec/anystyle/parser/normalizer_spec.rb +20 -20
- data/spec/spec_helper.rb +23 -0
- metadata +22 -49
data/Gemfile
CHANGED
@@ -1,14 +1,19 @@
|
|
1
1
|
source :rubygems
|
2
2
|
gemspec
|
3
3
|
|
4
|
-
group :
|
5
|
-
gem '
|
6
|
-
gem '
|
7
|
-
gem '
|
4
|
+
group :development do
|
5
|
+
gem 'debugger', :platforms => [:mri_19]
|
6
|
+
gem 'simplecov'
|
7
|
+
gem 'yard'
|
8
8
|
end
|
9
9
|
|
10
|
-
group :
|
11
|
-
gem '
|
10
|
+
group :test do
|
11
|
+
gem 'rake'
|
12
|
+
gem 'racc', '~>1.4'
|
13
|
+
|
14
|
+
gem 'cucumber'
|
15
|
+
gem 'rspec'
|
16
|
+
gem 'ZenTest'
|
12
17
|
end
|
13
18
|
|
14
19
|
group :profile do
|
@@ -16,6 +21,7 @@ group :profile do
|
|
16
21
|
gem 'gnuplot'
|
17
22
|
end
|
18
23
|
|
19
|
-
group :
|
24
|
+
group :extra do
|
20
25
|
gem 'kyotocabinet-ruby', :require => 'kyotocabinet'
|
26
|
+
gem 'autotest-fsevent', :require => false
|
21
27
|
end
|
data/HISTORY.md
CHANGED
data/README.md
CHANGED
@@ -5,7 +5,7 @@ Anystyle-Parser is a very fast and smart parser for academic references. It
|
|
5
5
|
is inspired by [ParsCit](http://aye.comp.nus.edu.sg/parsCit/) and
|
6
6
|
[FreeCite](http://freecite.library.brown.edu/); Anystyle-Parser uses machine
|
7
7
|
learning algorithms and is designed
|
8
|
-
for raw speed (it [wapiti](https://github.com/inukshuk/wapiti-ruby) based
|
8
|
+
for raw speed (it uses [wapiti](https://github.com/inukshuk/wapiti-ruby) based
|
9
9
|
conditional random fields and [Kyoto Cabinet](http://fallabs.com/kyotocabinet/)
|
10
10
|
or [Redis](http://redis.io) as a key-value store), flexibility (it is easy to
|
11
11
|
train the model with data that is relevant to your parsing needs), and
|
data/Rakefile
CHANGED
@@ -1,10 +1,19 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
1
|
+
require 'bundler'
|
2
|
+
begin
|
3
|
+
Bundler.setup(:default, :development, :debug, :test, :extra, :profile)
|
4
|
+
rescue Bundler::BundlerError => e
|
5
|
+
$stderr.puts e.message
|
6
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
7
|
+
exit e.status_code
|
8
|
+
end
|
9
|
+
require 'rake'
|
4
10
|
require 'rake/clean'
|
5
11
|
|
12
|
+
$:.unshift(File.join(File.dirname(__FILE__), './lib'))
|
13
|
+
|
6
14
|
require 'anystyle/parser/version'
|
7
15
|
|
16
|
+
task :default
|
8
17
|
task :build => [:clean] do
|
9
18
|
system 'gem build anystyle-parser.gemspec'
|
10
19
|
end
|
@@ -14,5 +23,34 @@ task :release => [:build] do
|
|
14
23
|
system "gem push anystyle-parser-#{Anystyle::Parser::VERSION}.gem"
|
15
24
|
end
|
16
25
|
|
26
|
+
require 'rspec/core'
|
27
|
+
require 'rspec/core/rake_task'
|
28
|
+
RSpec::Core::RakeTask.new(:spec) do |spec|
|
29
|
+
spec.pattern = FileList['spec/**/*_spec.rb']
|
30
|
+
end
|
31
|
+
|
32
|
+
require 'cucumber/rake/task'
|
33
|
+
Cucumber::Rake::Task.new(:features)
|
34
|
+
|
35
|
+
task :default => :spec
|
36
|
+
|
37
|
+
begin
|
38
|
+
require 'yard'
|
39
|
+
YARD::Rake::YardocTask.new
|
40
|
+
rescue LoadError
|
41
|
+
# ignore
|
42
|
+
end
|
43
|
+
|
44
|
+
desc 'Run an IRB session with CSL loaded'
|
45
|
+
task :console, [:script] do |t, args|
|
46
|
+
ARGV.clear
|
47
|
+
|
48
|
+
require 'irb'
|
49
|
+
require 'anystyle/parser'
|
50
|
+
|
51
|
+
IRB.conf[:SCRIPT] = args.script
|
52
|
+
IRB.start
|
53
|
+
end
|
54
|
+
|
17
55
|
CLEAN.include('*.gem')
|
18
56
|
CLEAN.include('*.rbc')
|
data/anystyle-parser.gemspec
CHANGED
@@ -17,12 +17,7 @@ Gem::Specification.new do |s|
|
|
17
17
|
|
18
18
|
s.add_runtime_dependency('bibtex-ruby', '~>2.0')
|
19
19
|
s.add_runtime_dependency('wapiti', '~>0.0')
|
20
|
-
|
21
|
-
s.add_development_dependency('rake', ['~>0.9'])
|
22
|
-
s.add_development_dependency('racc', ['~>1.4'])
|
23
|
-
s.add_development_dependency('cucumber', ['~>1.0'])
|
24
|
-
s.add_development_dependency('rspec', ['~>2.6'])
|
25
|
-
s.add_development_dependency('ZenTest', ['~>4.6'])
|
20
|
+
s.add_runtime_dependency('namae', '~>0.7')
|
26
21
|
|
27
22
|
s.files = `git ls-files`.split("\n") - Dir['resources/**/*']
|
28
23
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
data/cucumber.yml
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
default: --format progress --require features --color
|
data/lib/anystyle/parser/normalizer.rb
CHANGED
@@ -129,47 +129,15 @@ module Anystyle
|
|
129
129
|
hash
|
130
130
|
end
|
131
131
|
|
132
|
+
Namae::Parser.instance.options[:prefer_comma_as_separator] = true
|
133
|
+
|
132
134
|
def normalize_names(names)
|
133
|
-
|
134
|
-
name.strip!
|
135
|
-
name.gsub!(/\b([[:upper:]]{2,3})\b/) { $1.split(//).join(' ') }
|
136
|
-
name.gsub!(/\b([[:upper:]])(\s|$)/) { [$1, $2 == ?. ? nil : ?., $2].compact.join }
|
137
|
-
name
|
138
|
-
end
|
139
|
-
names.join(' and ')
|
135
|
+
Namae.parse!(names).map(&:sort_order).join(' and ')
|
140
136
|
rescue => e
|
141
137
|
warn e.message
|
142
138
|
hash
|
143
139
|
end
|
144
|
-
|
145
|
-
def tokenize_names(names)
|
146
|
-
s, n, ns, cc = StringScanner.new(names), '', [], 0
|
147
|
-
until s.eos?
|
148
|
-
case
|
149
|
-
when s.scan(/,?\s*(and\b|&|;)/)
|
150
|
-
ns << n
|
151
|
-
n, cc = '', 0
|
152
|
-
when s.scan(/\s+/)
|
153
|
-
n << ' '
|
154
|
-
when s.scan(/,?\s*(jr|sr|ph\.?d|m\.?d|esq)\.?/i)
|
155
|
-
n << s.matched
|
156
|
-
when s.scan(/,/)
|
157
|
-
if cc > 0 || (n =~ /\S{2,}\s+\S{2,}/ && s.rest !~ /^\s*[[:alpha:]]+(\.|,|$)/)
|
158
|
-
ns << n
|
159
|
-
n, cc = '', 0
|
160
|
-
else
|
161
|
-
n << s.matched
|
162
|
-
cc += 1
|
163
|
-
end
|
164
|
-
when s.scan(/[[:alpha:]]+/)
|
165
|
-
n << s.matched
|
166
|
-
when s.scan(/./)
|
167
|
-
n << s.matched
|
168
|
-
end
|
169
|
-
end
|
170
|
-
ns << n
|
171
|
-
end
|
172
|
-
|
140
|
+
|
173
141
|
def normalize_title(hash)
|
174
142
|
title, container = hash[:title]
|
175
143
|
|
data/lib/anystyle/parser.rb
CHANGED
data/spec/anystyle/parser/normalizer_spec.rb
CHANGED
@@ -6,46 +6,46 @@ module Anystyle
|
|
6
6
|
describe "#tokenize_names" do
|
7
7
|
|
8
8
|
it "tokenizes 'A B'" do
|
9
|
-
Normalizer.instance.
|
9
|
+
Normalizer.instance.normalize_names('A B').should == 'B, A'
|
10
10
|
end
|
11
11
|
|
12
12
|
it "tokenizes 'A, B'" do
|
13
|
-
Normalizer.instance.
|
13
|
+
Normalizer.instance.normalize_names('A, B').should == 'A, B'
|
14
14
|
end
|
15
15
|
|
16
|
-
it "tokenizes 'A, jr., B'" do
|
17
|
-
|
18
|
-
end
|
19
|
-
|
20
|
-
it "tokenizes 'A, B, jr.'" do
|
21
|
-
|
22
|
-
end
|
16
|
+
# it "tokenizes 'A, jr., B'" do
|
17
|
+
# Normalizer.instance.normalize_names('A, jr., B').should == 'A, jr., B'
|
18
|
+
# end
|
19
|
+
#
|
20
|
+
# it "tokenizes 'A, B, jr.'" do
|
21
|
+
# Normalizer.instance.normalize_names('A, B, jr.').should == 'A, B, jr.'
|
22
|
+
# end
|
23
23
|
|
24
24
|
it "tokenizes 'A, B, C, D'" do
|
25
|
-
Normalizer.instance.
|
25
|
+
Normalizer.instance.normalize_names('A, B, C, D').should == 'A, B and C, D'
|
26
26
|
end
|
27
27
|
|
28
28
|
it "tokenizes 'A, B, C'" do
|
29
|
-
Normalizer.instance.
|
29
|
+
Normalizer.instance.normalize_names('A, B, C').should == 'A, B and C'
|
30
30
|
end
|
31
31
|
|
32
32
|
it "tokenizes 'Aa Bb, C.'" do
|
33
|
-
Normalizer.instance.
|
33
|
+
Normalizer.instance.normalize_names('Aa Bb, C.').should == 'Aa Bb, C.'
|
34
34
|
end
|
35
35
|
|
36
36
|
it "tokenizes 'Aa Bb, Cc Dd, and E F G'" do
|
37
|
-
Normalizer.instance.
|
37
|
+
Normalizer.instance.normalize_names('Aa Bb, C D, and E F G').should == 'Bb, Aa and D, C and G, E F'
|
38
38
|
end
|
39
39
|
|
40
40
|
[
|
41
|
-
['Poe, Edgar A.',
|
42
|
-
['Edgar A. Poe',
|
43
|
-
['Edgar A. Poe, Herman Melville',
|
44
|
-
['Poe, Edgar A., Melville, Herman',
|
45
|
-
['Aeschlimann Magnin, E.',
|
46
|
-
].each do |name,
|
41
|
+
['Poe, Edgar A.', 'Poe, Edgar A.'],
|
42
|
+
['Edgar A. Poe', 'Poe, Edgar A.'],
|
43
|
+
['Edgar A. Poe, Herman Melville', 'Poe, Edgar A. and Melville, Herman'],
|
44
|
+
['Poe, Edgar A., Melville, Herman', 'Poe, Edgar A. and Melville, Herman'],
|
45
|
+
['Aeschlimann Magnin, E.', 'Aeschlimann Magnin, E.']
|
46
|
+
].each do |name, normalized|
|
47
47
|
it "tokenizes #{name.inspect}" do
|
48
|
-
Normalizer.instance.
|
48
|
+
Normalizer.instance.normalize_names(name).should == normalized
|
49
49
|
end
|
50
50
|
end
|
51
51
|
|
data/spec/spec_helper.rb
CHANGED
@@ -1 +1,24 @@
|
|
1
|
+
begin
|
2
|
+
require 'simplecov'
|
3
|
+
rescue LoadError
|
4
|
+
# ignore
|
5
|
+
end
|
6
|
+
|
7
|
+
begin
|
8
|
+
require 'debugger'
|
9
|
+
rescue LoadError
|
10
|
+
# ignore
|
11
|
+
end
|
12
|
+
|
13
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
14
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
15
|
+
require 'rspec'
|
1
16
|
require 'anystyle/parser'
|
17
|
+
|
18
|
+
# Requires supporting files with custom matchers and macros, etc,
|
19
|
+
# in ./support/ and its subdirectories.
|
20
|
+
Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
|
21
|
+
|
22
|
+
RSpec.configure do |config|
|
23
|
+
|
24
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anystyle-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-10-29 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bibtex-ruby
|
16
|
-
requirement:
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,73 +21,44 @@ dependencies:
|
|
21
21
|
version: '2.0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements:
|
25
|
-
- !ruby/object:Gem::Dependency
|
26
|
-
name: wapiti
|
27
|
-
requirement: &70180181145980 !ruby/object:Gem::Requirement
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
28
25
|
none: false
|
29
26
|
requirements:
|
30
27
|
- - ~>
|
31
28
|
- !ruby/object:Gem::Version
|
32
|
-
version: '
|
33
|
-
type: :runtime
|
34
|
-
prerelease: false
|
35
|
-
version_requirements: *70180181145980
|
36
|
-
- !ruby/object:Gem::Dependency
|
37
|
-
name: rake
|
38
|
-
requirement: &70180181145180 !ruby/object:Gem::Requirement
|
39
|
-
none: false
|
40
|
-
requirements:
|
41
|
-
- - ~>
|
42
|
-
- !ruby/object:Gem::Version
|
43
|
-
version: '0.9'
|
44
|
-
type: :development
|
45
|
-
prerelease: false
|
46
|
-
version_requirements: *70180181145180
|
29
|
+
version: '2.0'
|
47
30
|
- !ruby/object:Gem::Dependency
|
48
|
-
name:
|
49
|
-
requirement:
|
31
|
+
name: wapiti
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
50
33
|
none: false
|
51
34
|
requirements:
|
52
35
|
- - ~>
|
53
36
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
55
|
-
type: :
|
37
|
+
version: '0.0'
|
38
|
+
type: :runtime
|
56
39
|
prerelease: false
|
57
|
-
version_requirements:
|
58
|
-
- !ruby/object:Gem::Dependency
|
59
|
-
name: cucumber
|
60
|
-
requirement: &70180181143520 !ruby/object:Gem::Requirement
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
61
41
|
none: false
|
62
42
|
requirements:
|
63
43
|
- - ~>
|
64
44
|
- !ruby/object:Gem::Version
|
65
|
-
version: '
|
66
|
-
type: :development
|
67
|
-
prerelease: false
|
68
|
-
version_requirements: *70180181143520
|
45
|
+
version: '0.0'
|
69
46
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
71
|
-
requirement:
|
47
|
+
name: namae
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
72
49
|
none: false
|
73
50
|
requirements:
|
74
51
|
- - ~>
|
75
52
|
- !ruby/object:Gem::Version
|
76
|
-
version: '
|
77
|
-
type: :
|
53
|
+
version: '0.7'
|
54
|
+
type: :runtime
|
78
55
|
prerelease: false
|
79
|
-
version_requirements:
|
80
|
-
- !ruby/object:Gem::Dependency
|
81
|
-
name: ZenTest
|
82
|
-
requirement: &70180181142000 !ruby/object:Gem::Requirement
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
83
57
|
none: false
|
84
58
|
requirements:
|
85
59
|
- - ~>
|
86
60
|
- !ruby/object:Gem::Version
|
87
|
-
version: '
|
88
|
-
type: :development
|
89
|
-
prerelease: false
|
90
|
-
version_requirements: *70180181142000
|
61
|
+
version: '0.7'
|
91
62
|
description: A sophisticated parser for academic references based on machine learning
|
92
63
|
algorithms using conditional random fields.
|
93
64
|
email:
|
@@ -107,6 +78,7 @@ files:
|
|
107
78
|
- README.md
|
108
79
|
- Rakefile
|
109
80
|
- anystyle-parser.gemspec
|
81
|
+
- cucumber.yml
|
110
82
|
- features/step_definitions/parser_steps.rb
|
111
83
|
- features/support/env.rb
|
112
84
|
- lib/anystyle/parser.rb
|
@@ -148,7 +120,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
148
120
|
version: '0'
|
149
121
|
segments:
|
150
122
|
- 0
|
151
|
-
hash: -
|
123
|
+
hash: -4454145497019098220
|
152
124
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
153
125
|
none: false
|
154
126
|
requirements:
|
@@ -157,10 +129,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
157
129
|
version: '0'
|
158
130
|
segments:
|
159
131
|
- 0
|
160
|
-
hash: -
|
132
|
+
hash: -4454145497019098220
|
161
133
|
requirements: []
|
162
134
|
rubyforge_project:
|
163
|
-
rubygems_version: 1.8.
|
135
|
+
rubygems_version: 1.8.24
|
164
136
|
signing_key:
|
165
137
|
specification_version: 3
|
166
138
|
summary: Parser for academic references.
|
@@ -174,3 +146,4 @@ test_files:
|
|
174
146
|
- spec/benchmark.rb
|
175
147
|
- spec/profile.rb
|
176
148
|
- spec/spec_helper.rb
|
149
|
+
has_rdoc:
|