anystyle-parser 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +13 -7
- data/HISTORY.md +4 -0
- data/README.md +1 -1
- data/Rakefile +41 -3
- data/anystyle-parser.gemspec +1 -6
- data/cucumber.yml +1 -0
- data/lib/anystyle/parser/normalizer.rb +4 -36
- data/lib/anystyle/parser/version.rb +1 -1
- data/lib/anystyle/parser.rb +1 -3
- data/spec/anystyle/parser/normalizer_spec.rb +20 -20
- data/spec/spec_helper.rb +23 -0
- metadata +22 -49
data/Gemfile
CHANGED
@@ -1,14 +1,19 @@
|
|
1
1
|
source :rubygems
|
2
2
|
gemspec
|
3
3
|
|
4
|
-
group :
|
5
|
-
gem '
|
6
|
-
gem '
|
7
|
-
gem '
|
4
|
+
group :development do
|
5
|
+
gem 'debugger', :platforms => [:mri_19]
|
6
|
+
gem 'simplecov'
|
7
|
+
gem 'yard'
|
8
8
|
end
|
9
9
|
|
10
|
-
group :
|
11
|
-
gem '
|
10
|
+
group :test do
|
11
|
+
gem 'rake'
|
12
|
+
gem 'racc', '~>1.4'
|
13
|
+
|
14
|
+
gem 'cucumber'
|
15
|
+
gem 'rspec'
|
16
|
+
gem 'ZenTest'
|
12
17
|
end
|
13
18
|
|
14
19
|
group :profile do
|
@@ -16,6 +21,7 @@ group :profile do
|
|
16
21
|
gem 'gnuplot'
|
17
22
|
end
|
18
23
|
|
19
|
-
group :
|
24
|
+
group :extra do
|
20
25
|
gem 'kyotocabinet-ruby', :require => 'kyotocabinet'
|
26
|
+
gem 'autotest-fsevent', :require => false
|
21
27
|
end
|
data/HISTORY.md
CHANGED
data/README.md
CHANGED
@@ -5,7 +5,7 @@ Anystyle-Parser is a very fast and smart parser for academic references. It
|
|
5
5
|
is inspired by [ParsCit](http://aye.comp.nus.edu.sg/parsCit/) and
|
6
6
|
[FreeCite](http://freecite.library.brown.edu/); Anystyle-Parser uses machine
|
7
7
|
learning algorithms and is designed
|
8
|
-
for raw speed (it [wapiti](https://github.com/inukshuk/wapiti-ruby) based
|
8
|
+
for raw speed (it uses [wapiti](https://github.com/inukshuk/wapiti-ruby) based
|
9
9
|
conditional random fields and [Kyoto Cabinet](http://fallabs.com/kyotocabinet/)
|
10
10
|
or [Redis](http://redis.io) as a key-value store), flexibility (it is easy to
|
11
11
|
train the model with data that is relevant to your parsing needs), and
|
data/Rakefile
CHANGED
@@ -1,10 +1,19 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
1
|
+
require 'bundler'
|
2
|
+
begin
|
3
|
+
Bundler.setup(:default, :development, :debug, :test, :extra, :profile)
|
4
|
+
rescue Bundler::BundlerError => e
|
5
|
+
$stderr.puts e.message
|
6
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
7
|
+
exit e.status_code
|
8
|
+
end
|
9
|
+
require 'rake'
|
4
10
|
require 'rake/clean'
|
5
11
|
|
12
|
+
$:.unshift(File.join(File.dirname(__FILE__), './lib'))
|
13
|
+
|
6
14
|
require 'anystyle/parser/version'
|
7
15
|
|
16
|
+
task :default
|
8
17
|
task :build => [:clean] do
|
9
18
|
system 'gem build anystyle-parser.gemspec'
|
10
19
|
end
|
@@ -14,5 +23,34 @@ task :release => [:build] do
|
|
14
23
|
system "gem push anystyle-parser-#{Anystyle::Parser::VERSION}.gem"
|
15
24
|
end
|
16
25
|
|
26
|
+
require 'rspec/core'
|
27
|
+
require 'rspec/core/rake_task'
|
28
|
+
RSpec::Core::RakeTask.new(:spec) do |spec|
|
29
|
+
spec.pattern = FileList['spec/**/*_spec.rb']
|
30
|
+
end
|
31
|
+
|
32
|
+
require 'cucumber/rake/task'
|
33
|
+
Cucumber::Rake::Task.new(:features)
|
34
|
+
|
35
|
+
task :default => :spec
|
36
|
+
|
37
|
+
begin
|
38
|
+
require 'yard'
|
39
|
+
YARD::Rake::YardocTask.new
|
40
|
+
rescue LoadError
|
41
|
+
# ignore
|
42
|
+
end
|
43
|
+
|
44
|
+
desc 'Run an IRB session with CSL loaded'
|
45
|
+
task :console, [:script] do |t, args|
|
46
|
+
ARGV.clear
|
47
|
+
|
48
|
+
require 'irb'
|
49
|
+
require 'anystyle/parser'
|
50
|
+
|
51
|
+
IRB.conf[:SCRIPT] = args.script
|
52
|
+
IRB.start
|
53
|
+
end
|
54
|
+
|
17
55
|
CLEAN.include('*.gem')
|
18
56
|
CLEAN.include('*.rbc')
|
data/anystyle-parser.gemspec
CHANGED
@@ -17,12 +17,7 @@ Gem::Specification.new do |s|
|
|
17
17
|
|
18
18
|
s.add_runtime_dependency('bibtex-ruby', '~>2.0')
|
19
19
|
s.add_runtime_dependency('wapiti', '~>0.0')
|
20
|
-
|
21
|
-
s.add_development_dependency('rake', ['~>0.9'])
|
22
|
-
s.add_development_dependency('racc', ['~>1.4'])
|
23
|
-
s.add_development_dependency('cucumber', ['~>1.0'])
|
24
|
-
s.add_development_dependency('rspec', ['~>2.6'])
|
25
|
-
s.add_development_dependency('ZenTest', ['~>4.6'])
|
20
|
+
s.add_runtime_dependency('namae', '~>0.7')
|
26
21
|
|
27
22
|
s.files = `git ls-files`.split("\n") - Dir['resources/**/*']
|
28
23
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
data/cucumber.yml
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
default: --format progress --require features --color
|
@@ -129,47 +129,15 @@ module Anystyle
|
|
129
129
|
hash
|
130
130
|
end
|
131
131
|
|
132
|
+
Namae::Parser.instance.options[:prefer_comma_as_separator] = true
|
133
|
+
|
132
134
|
def normalize_names(names)
|
133
|
-
|
134
|
-
name.strip!
|
135
|
-
name.gsub!(/\b([[:upper:]]{2,3})\b/) { $1.split(//).join(' ') }
|
136
|
-
name.gsub!(/\b([[:upper:]])(\s|$)/) { [$1, $2 == ?. ? nil : ?., $2].compact.join }
|
137
|
-
name
|
138
|
-
end
|
139
|
-
names.join(' and ')
|
135
|
+
Namae.parse!(names).map(&:sort_order).join(' and ')
|
140
136
|
rescue => e
|
141
137
|
warn e.message
|
142
138
|
hash
|
143
139
|
end
|
144
|
-
|
145
|
-
def tokenize_names(names)
|
146
|
-
s, n, ns, cc = StringScanner.new(names), '', [], 0
|
147
|
-
until s.eos?
|
148
|
-
case
|
149
|
-
when s.scan(/,?\s*(and\b|&|;)/)
|
150
|
-
ns << n
|
151
|
-
n, cc = '', 0
|
152
|
-
when s.scan(/\s+/)
|
153
|
-
n << ' '
|
154
|
-
when s.scan(/,?\s*(jr|sr|ph\.?d|m\.?d|esq)\.?/i)
|
155
|
-
n << s.matched
|
156
|
-
when s.scan(/,/)
|
157
|
-
if cc > 0 || (n =~ /\S{2,}\s+\S{2,}/ && s.rest !~ /^\s*[[:alpha:]]+(\.|,|$)/)
|
158
|
-
ns << n
|
159
|
-
n, cc = '', 0
|
160
|
-
else
|
161
|
-
n << s.matched
|
162
|
-
cc += 1
|
163
|
-
end
|
164
|
-
when s.scan(/[[:alpha:]]+/)
|
165
|
-
n << s.matched
|
166
|
-
when s.scan(/./)
|
167
|
-
n << s.matched
|
168
|
-
end
|
169
|
-
end
|
170
|
-
ns << n
|
171
|
-
end
|
172
|
-
|
140
|
+
|
173
141
|
def normalize_title(hash)
|
174
142
|
title, container = hash[:title]
|
175
143
|
|
data/lib/anystyle/parser.rb
CHANGED
@@ -6,46 +6,46 @@ module Anystyle
|
|
6
6
|
describe "#tokenize_names" do
|
7
7
|
|
8
8
|
it "tokenizes 'A B'" do
|
9
|
-
Normalizer.instance.
|
9
|
+
Normalizer.instance.normalize_names('A B').should == 'B, A'
|
10
10
|
end
|
11
11
|
|
12
12
|
it "tokenizes 'A, B'" do
|
13
|
-
Normalizer.instance.
|
13
|
+
Normalizer.instance.normalize_names('A, B').should == 'A, B'
|
14
14
|
end
|
15
15
|
|
16
|
-
it "tokenizes 'A, jr., B'" do
|
17
|
-
|
18
|
-
end
|
19
|
-
|
20
|
-
it "tokenizes 'A, B, jr.'" do
|
21
|
-
|
22
|
-
end
|
16
|
+
# it "tokenizes 'A, jr., B'" do
|
17
|
+
# Normalizer.instance.normalize_names('A, jr., B').should == 'A, jr., B'
|
18
|
+
# end
|
19
|
+
#
|
20
|
+
# it "tokenizes 'A, B, jr.'" do
|
21
|
+
# Normalizer.instance.normalize_names('A, B, jr.').should == 'A, B, jr.'
|
22
|
+
# end
|
23
23
|
|
24
24
|
it "tokenizes 'A, B, C, D'" do
|
25
|
-
Normalizer.instance.
|
25
|
+
Normalizer.instance.normalize_names('A, B, C, D').should == 'A, B and C, D'
|
26
26
|
end
|
27
27
|
|
28
28
|
it "tokenizes 'A, B, C'" do
|
29
|
-
Normalizer.instance.
|
29
|
+
Normalizer.instance.normalize_names('A, B, C').should == 'A, B and C'
|
30
30
|
end
|
31
31
|
|
32
32
|
it "tokenizes 'Aa Bb, C.'" do
|
33
|
-
Normalizer.instance.
|
33
|
+
Normalizer.instance.normalize_names('Aa Bb, C.').should == 'Aa Bb, C.'
|
34
34
|
end
|
35
35
|
|
36
36
|
it "tokenizes 'Aa Bb, Cc Dd, and E F G'" do
|
37
|
-
Normalizer.instance.
|
37
|
+
Normalizer.instance.normalize_names('Aa Bb, C D, and E F G').should == 'Bb, Aa and D, C and G, E F'
|
38
38
|
end
|
39
39
|
|
40
40
|
[
|
41
|
-
['Poe, Edgar A.',
|
42
|
-
['Edgar A. Poe',
|
43
|
-
['Edgar A. Poe, Herman Melville',
|
44
|
-
['Poe, Edgar A., Melville, Herman',
|
45
|
-
['Aeschlimann Magnin, E.',
|
46
|
-
].each do |name,
|
41
|
+
['Poe, Edgar A.', 'Poe, Edgar A.'],
|
42
|
+
['Edgar A. Poe', 'Poe, Edgar A.'],
|
43
|
+
['Edgar A. Poe, Herman Melville', 'Poe, Edgar A. and Melville, Herman'],
|
44
|
+
['Poe, Edgar A., Melville, Herman', 'Poe, Edgar A. and Melville, Herman'],
|
45
|
+
['Aeschlimann Magnin, E.', 'Aeschlimann Magnin, E.']
|
46
|
+
].each do |name, normalized|
|
47
47
|
it "tokenizes #{name.inspect}" do
|
48
|
-
Normalizer.instance.
|
48
|
+
Normalizer.instance.normalize_names(name).should == normalized
|
49
49
|
end
|
50
50
|
end
|
51
51
|
|
data/spec/spec_helper.rb
CHANGED
@@ -1 +1,24 @@
|
|
1
|
+
begin
|
2
|
+
require 'simplecov'
|
3
|
+
rescue LoadError
|
4
|
+
# ignore
|
5
|
+
end
|
6
|
+
|
7
|
+
begin
|
8
|
+
require 'debugger'
|
9
|
+
rescue LoadError
|
10
|
+
# ignore
|
11
|
+
end
|
12
|
+
|
13
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
14
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
15
|
+
require 'rspec'
|
1
16
|
require 'anystyle/parser'
|
17
|
+
|
18
|
+
# Requires supporting files with custom matchers and macros, etc,
|
19
|
+
# in ./support/ and its subdirectories.
|
20
|
+
Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
|
21
|
+
|
22
|
+
RSpec.configure do |config|
|
23
|
+
|
24
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anystyle-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-10-29 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bibtex-ruby
|
16
|
-
requirement:
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,73 +21,44 @@ dependencies:
|
|
21
21
|
version: '2.0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements:
|
25
|
-
- !ruby/object:Gem::Dependency
|
26
|
-
name: wapiti
|
27
|
-
requirement: &70180181145980 !ruby/object:Gem::Requirement
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
28
25
|
none: false
|
29
26
|
requirements:
|
30
27
|
- - ~>
|
31
28
|
- !ruby/object:Gem::Version
|
32
|
-
version: '
|
33
|
-
type: :runtime
|
34
|
-
prerelease: false
|
35
|
-
version_requirements: *70180181145980
|
36
|
-
- !ruby/object:Gem::Dependency
|
37
|
-
name: rake
|
38
|
-
requirement: &70180181145180 !ruby/object:Gem::Requirement
|
39
|
-
none: false
|
40
|
-
requirements:
|
41
|
-
- - ~>
|
42
|
-
- !ruby/object:Gem::Version
|
43
|
-
version: '0.9'
|
44
|
-
type: :development
|
45
|
-
prerelease: false
|
46
|
-
version_requirements: *70180181145180
|
29
|
+
version: '2.0'
|
47
30
|
- !ruby/object:Gem::Dependency
|
48
|
-
name:
|
49
|
-
requirement:
|
31
|
+
name: wapiti
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
50
33
|
none: false
|
51
34
|
requirements:
|
52
35
|
- - ~>
|
53
36
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
55
|
-
type: :
|
37
|
+
version: '0.0'
|
38
|
+
type: :runtime
|
56
39
|
prerelease: false
|
57
|
-
version_requirements:
|
58
|
-
- !ruby/object:Gem::Dependency
|
59
|
-
name: cucumber
|
60
|
-
requirement: &70180181143520 !ruby/object:Gem::Requirement
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
61
41
|
none: false
|
62
42
|
requirements:
|
63
43
|
- - ~>
|
64
44
|
- !ruby/object:Gem::Version
|
65
|
-
version: '
|
66
|
-
type: :development
|
67
|
-
prerelease: false
|
68
|
-
version_requirements: *70180181143520
|
45
|
+
version: '0.0'
|
69
46
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
71
|
-
requirement:
|
47
|
+
name: namae
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
72
49
|
none: false
|
73
50
|
requirements:
|
74
51
|
- - ~>
|
75
52
|
- !ruby/object:Gem::Version
|
76
|
-
version: '
|
77
|
-
type: :
|
53
|
+
version: '0.7'
|
54
|
+
type: :runtime
|
78
55
|
prerelease: false
|
79
|
-
version_requirements:
|
80
|
-
- !ruby/object:Gem::Dependency
|
81
|
-
name: ZenTest
|
82
|
-
requirement: &70180181142000 !ruby/object:Gem::Requirement
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
83
57
|
none: false
|
84
58
|
requirements:
|
85
59
|
- - ~>
|
86
60
|
- !ruby/object:Gem::Version
|
87
|
-
version: '
|
88
|
-
type: :development
|
89
|
-
prerelease: false
|
90
|
-
version_requirements: *70180181142000
|
61
|
+
version: '0.7'
|
91
62
|
description: A sophisticated parser for academic references based on machine learning
|
92
63
|
algorithms using conditional random fields.
|
93
64
|
email:
|
@@ -107,6 +78,7 @@ files:
|
|
107
78
|
- README.md
|
108
79
|
- Rakefile
|
109
80
|
- anystyle-parser.gemspec
|
81
|
+
- cucumber.yml
|
110
82
|
- features/step_definitions/parser_steps.rb
|
111
83
|
- features/support/env.rb
|
112
84
|
- lib/anystyle/parser.rb
|
@@ -148,7 +120,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
148
120
|
version: '0'
|
149
121
|
segments:
|
150
122
|
- 0
|
151
|
-
hash: -
|
123
|
+
hash: -4454145497019098220
|
152
124
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
153
125
|
none: false
|
154
126
|
requirements:
|
@@ -157,10 +129,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
157
129
|
version: '0'
|
158
130
|
segments:
|
159
131
|
- 0
|
160
|
-
hash: -
|
132
|
+
hash: -4454145497019098220
|
161
133
|
requirements: []
|
162
134
|
rubyforge_project:
|
163
|
-
rubygems_version: 1.8.
|
135
|
+
rubygems_version: 1.8.24
|
164
136
|
signing_key:
|
165
137
|
specification_version: 3
|
166
138
|
summary: Parser for academic references.
|
@@ -174,3 +146,4 @@ test_files:
|
|
174
146
|
- spec/benchmark.rb
|
175
147
|
- spec/profile.rb
|
176
148
|
- spec/spec_helper.rb
|
149
|
+
has_rdoc:
|