anystyle-parser 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile CHANGED
@@ -1,14 +1,19 @@
1
1
  source :rubygems
2
2
  gemspec
3
3
 
4
- group :debug do
5
- gem 'ruby-debug19', :require => 'ruby-debug', :platforms => [:mri_19]
6
- gem 'ruby-debug', :platforms => [:mri_18, :jruby]
7
- gem 'rbx-trepanning', :platforms => [:rbx]
4
+ group :development do
5
+ gem 'debugger', :platforms => [:mri_19]
6
+ gem 'simplecov'
7
+ gem 'yard'
8
8
  end
9
9
 
10
- group :osx_test do
11
- gem 'autotest-fsevent', :require => false
10
+ group :test do
11
+ gem 'rake'
12
+ gem 'racc', '~>1.4'
13
+
14
+ gem 'cucumber'
15
+ gem 'rspec'
16
+ gem 'ZenTest'
12
17
  end
13
18
 
14
19
  group :profile do
@@ -16,6 +21,7 @@ group :profile do
16
21
  gem 'gnuplot'
17
22
  end
18
23
 
19
- group :kyotocabinet do
24
+ group :extra do
20
25
  gem 'kyotocabinet-ruby', :require => 'kyotocabinet'
26
+ gem 'autotest-fsevent', :require => false
21
27
  end
data/HISTORY.md CHANGED
@@ -1,3 +1,7 @@
1
+ 0.2.0 / 2012-10-29
2
+ ==================
3
+ * Use Namae for name normalization
4
+
1
5
  0.1.1 / 2012-03-29
2
6
  ==================
3
7
  * Bugfix
data/README.md CHANGED
@@ -5,7 +5,7 @@ Anystyle-Parser is a very fast and smart parser for academic references. It
5
5
  is inspired by [ParsCit](http://aye.comp.nus.edu.sg/parsCit/) and
6
6
  [FreeCite](http://freecite.library.brown.edu/); Anystyle-Parser uses machine
7
7
  learning algorithms and is designed
8
- for raw speed (it [wapiti](https://github.com/inukshuk/wapiti-ruby) based
8
+ for raw speed (it uses [wapiti](https://github.com/inukshuk/wapiti-ruby) based
9
9
  conditional random fields and [Kyoto Cabinet](http://fallabs.com/kyotocabinet/)
10
10
  or [Redis](http://redis.io) as a key-value store), flexibility (it is easy to
11
11
  train the model with data that is relevant to your parsing needs), and
data/Rakefile CHANGED
@@ -1,10 +1,19 @@
1
- lib = File.expand_path('../lib/', __FILE__)
2
- $:.unshift lib unless $:.include?(lib)
3
-
1
+ require 'bundler'
2
+ begin
3
+ Bundler.setup(:default, :development, :debug, :test, :extra, :profile)
4
+ rescue Bundler::BundlerError => e
5
+ $stderr.puts e.message
6
+ $stderr.puts "Run `bundle install` to install missing gems"
7
+ exit e.status_code
8
+ end
9
+ require 'rake'
4
10
  require 'rake/clean'
5
11
 
12
+ $:.unshift(File.join(File.dirname(__FILE__), './lib'))
13
+
6
14
  require 'anystyle/parser/version'
7
15
 
16
+ task :default
8
17
  task :build => [:clean] do
9
18
  system 'gem build anystyle-parser.gemspec'
10
19
  end
@@ -14,5 +23,34 @@ task :release => [:build] do
14
23
  system "gem push anystyle-parser-#{Anystyle::Parser::VERSION}.gem"
15
24
  end
16
25
 
26
+ require 'rspec/core'
27
+ require 'rspec/core/rake_task'
28
+ RSpec::Core::RakeTask.new(:spec) do |spec|
29
+ spec.pattern = FileList['spec/**/*_spec.rb']
30
+ end
31
+
32
+ require 'cucumber/rake/task'
33
+ Cucumber::Rake::Task.new(:features)
34
+
35
+ task :default => :spec
36
+
37
+ begin
38
+ require 'yard'
39
+ YARD::Rake::YardocTask.new
40
+ rescue LoadError
41
+ # ignore
42
+ end
43
+
44
+ desc 'Run an IRB session with CSL loaded'
45
+ task :console, [:script] do |t, args|
46
+ ARGV.clear
47
+
48
+ require 'irb'
49
+ require 'anystyle/parser'
50
+
51
+ IRB.conf[:SCRIPT] = args.script
52
+ IRB.start
53
+ end
54
+
17
55
  CLEAN.include('*.gem')
18
56
  CLEAN.include('*.rbc')
@@ -17,12 +17,7 @@ Gem::Specification.new do |s|
17
17
 
18
18
  s.add_runtime_dependency('bibtex-ruby', '~>2.0')
19
19
  s.add_runtime_dependency('wapiti', '~>0.0')
20
-
21
- s.add_development_dependency('rake', ['~>0.9'])
22
- s.add_development_dependency('racc', ['~>1.4'])
23
- s.add_development_dependency('cucumber', ['~>1.0'])
24
- s.add_development_dependency('rspec', ['~>2.6'])
25
- s.add_development_dependency('ZenTest', ['~>4.6'])
20
+ s.add_runtime_dependency('namae', '~>0.7')
26
21
 
27
22
  s.files = `git ls-files`.split("\n") - Dir['resources/**/*']
28
23
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
data/cucumber.yml ADDED
@@ -0,0 +1 @@
1
+ default: --format progress --require features --color
@@ -129,47 +129,15 @@ module Anystyle
129
129
  hash
130
130
  end
131
131
 
132
+ Namae::Parser.instance.options[:prefer_comma_as_separator] = true
133
+
132
134
  def normalize_names(names)
133
- names = tokenize_names(names).map do |name|
134
- name.strip!
135
- name.gsub!(/\b([[:upper:]]{2,3})\b/) { $1.split(//).join(' ') }
136
- name.gsub!(/\b([[:upper:]])(\s|$)/) { [$1, $2 == ?. ? nil : ?., $2].compact.join }
137
- name
138
- end
139
- names.join(' and ')
135
+ Namae.parse!(names).map(&:sort_order).join(' and ')
140
136
  rescue => e
141
137
  warn e.message
142
138
  hash
143
139
  end
144
-
145
- def tokenize_names(names)
146
- s, n, ns, cc = StringScanner.new(names), '', [], 0
147
- until s.eos?
148
- case
149
- when s.scan(/,?\s*(and\b|&|;)/)
150
- ns << n
151
- n, cc = '', 0
152
- when s.scan(/\s+/)
153
- n << ' '
154
- when s.scan(/,?\s*(jr|sr|ph\.?d|m\.?d|esq)\.?/i)
155
- n << s.matched
156
- when s.scan(/,/)
157
- if cc > 0 || (n =~ /\S{2,}\s+\S{2,}/ && s.rest !~ /^\s*[[:alpha:]]+(\.|,|$)/)
158
- ns << n
159
- n, cc = '', 0
160
- else
161
- n << s.matched
162
- cc += 1
163
- end
164
- when s.scan(/[[:alpha:]]+/)
165
- n << s.matched
166
- when s.scan(/./)
167
- n << s.matched
168
- end
169
- end
170
- ns << n
171
- end
172
-
140
+
173
141
  def normalize_title(hash)
174
142
  title, container = hash[:title]
175
143
 
@@ -1,5 +1,5 @@
1
1
  module Anystyle
2
2
  module Parser
3
- VERSION = '0.1.1'.freeze
3
+ VERSION = '0.2.0'.freeze
4
4
  end
5
5
  end
@@ -3,9 +3,7 @@ require 'singleton'
3
3
 
4
4
  require 'bibtex'
5
5
  require 'wapiti'
6
-
7
- # require 'ruby-debug'
8
- # Debugger.start
6
+ require 'namae'
9
7
 
10
8
  require 'anystyle/parser/errors'
11
9
 
@@ -6,46 +6,46 @@ module Anystyle
6
6
  describe "#tokenize_names" do
7
7
 
8
8
  it "tokenizes 'A B'" do
9
- Normalizer.instance.tokenize_names('A B').should == ['A B']
9
+ Normalizer.instance.normalize_names('A B').should == 'B, A'
10
10
  end
11
11
 
12
12
  it "tokenizes 'A, B'" do
13
- Normalizer.instance.tokenize_names('A, B').should == ['A, B']
13
+ Normalizer.instance.normalize_names('A, B').should == 'A, B'
14
14
  end
15
15
 
16
- it "tokenizes 'A, jr., B'" do
17
- Normalizer.instance.tokenize_names('A, jr., B').should == ['A, jr., B']
18
- end
19
-
20
- it "tokenizes 'A, B, jr.'" do
21
- Normalizer.instance.tokenize_names('A, B, jr.').should == ['A, B, jr.']
22
- end
16
+ # it "tokenizes 'A, jr., B'" do
17
+ # Normalizer.instance.normalize_names('A, jr., B').should == 'A, jr., B'
18
+ # end
19
+ #
20
+ # it "tokenizes 'A, B, jr.'" do
21
+ # Normalizer.instance.normalize_names('A, B, jr.').should == 'A, B, jr.'
22
+ # end
23
23
 
24
24
  it "tokenizes 'A, B, C, D'" do
25
- Normalizer.instance.tokenize_names('A, B, C, D').should == ['A, B', ' C, D']
25
+ Normalizer.instance.normalize_names('A, B, C, D').should == 'A, B and C, D'
26
26
  end
27
27
 
28
28
  it "tokenizes 'A, B, C'" do
29
- Normalizer.instance.tokenize_names('A, B, C').should == ['A, B', ' C']
29
+ Normalizer.instance.normalize_names('A, B, C').should == 'A, B and C'
30
30
  end
31
31
 
32
32
  it "tokenizes 'Aa Bb, C.'" do
33
- Normalizer.instance.tokenize_names('Aa Bb, C.').should == ['Aa Bb, C.']
33
+ Normalizer.instance.normalize_names('Aa Bb, C.').should == 'Aa Bb, C.'
34
34
  end
35
35
 
36
36
  it "tokenizes 'Aa Bb, Cc Dd, and E F G'" do
37
- Normalizer.instance.tokenize_names('Aa Bb, C D, and E F G').should == ['Aa Bb', ' C D', ' E F G']
37
+ Normalizer.instance.normalize_names('Aa Bb, C D, and E F G').should == 'Bb, Aa and D, C and G, E F'
38
38
  end
39
39
 
40
40
  [
41
- ['Poe, Edgar A.', ['Poe, Edgar A.']],
42
- ['Edgar A. Poe', ['Edgar A. Poe']],
43
- ['Edgar A. Poe, Herman Melville', ['Edgar A. Poe', ' Herman Melville']],
44
- ['Poe, Edgar A., Melville, Herman', ['Poe, Edgar A.', ' Melville, Herman']],
45
- ['Aeschlimann Magnin, E.', ['Aeschlimann Magnin, E.']]
46
- ].each do |name, tokens|
41
+ ['Poe, Edgar A.', 'Poe, Edgar A.'],
42
+ ['Edgar A. Poe', 'Poe, Edgar A.'],
43
+ ['Edgar A. Poe, Herman Melville', 'Poe, Edgar A. and Melville, Herman'],
44
+ ['Poe, Edgar A., Melville, Herman', 'Poe, Edgar A. and Melville, Herman'],
45
+ ['Aeschlimann Magnin, E.', 'Aeschlimann Magnin, E.']
46
+ ].each do |name, normalized|
47
47
  it "tokenizes #{name.inspect}" do
48
- Normalizer.instance.tokenize_names(name).should == tokens
48
+ Normalizer.instance.normalize_names(name).should == normalized
49
49
  end
50
50
  end
51
51
 
data/spec/spec_helper.rb CHANGED
@@ -1 +1,24 @@
1
+ begin
2
+ require 'simplecov'
3
+ rescue LoadError
4
+ # ignore
5
+ end
6
+
7
+ begin
8
+ require 'debugger'
9
+ rescue LoadError
10
+ # ignore
11
+ end
12
+
13
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
14
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
15
+ require 'rspec'
1
16
  require 'anystyle/parser'
17
+
18
+ # Requires supporting files with custom matchers and macros, etc,
19
+ # in ./support/ and its subdirectories.
20
+ Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
21
+
22
+ RSpec.configure do |config|
23
+
24
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anystyle-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-03-29 00:00:00.000000000 Z
12
+ date: 2012-10-29 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bibtex-ruby
16
- requirement: &70180181117220 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,73 +21,44 @@ dependencies:
21
21
  version: '2.0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70180181117220
25
- - !ruby/object:Gem::Dependency
26
- name: wapiti
27
- requirement: &70180181145980 !ruby/object:Gem::Requirement
24
+ version_requirements: !ruby/object:Gem::Requirement
28
25
  none: false
29
26
  requirements:
30
27
  - - ~>
31
28
  - !ruby/object:Gem::Version
32
- version: '0.0'
33
- type: :runtime
34
- prerelease: false
35
- version_requirements: *70180181145980
36
- - !ruby/object:Gem::Dependency
37
- name: rake
38
- requirement: &70180181145180 !ruby/object:Gem::Requirement
39
- none: false
40
- requirements:
41
- - - ~>
42
- - !ruby/object:Gem::Version
43
- version: '0.9'
44
- type: :development
45
- prerelease: false
46
- version_requirements: *70180181145180
29
+ version: '2.0'
47
30
  - !ruby/object:Gem::Dependency
48
- name: racc
49
- requirement: &70180181144260 !ruby/object:Gem::Requirement
31
+ name: wapiti
32
+ requirement: !ruby/object:Gem::Requirement
50
33
  none: false
51
34
  requirements:
52
35
  - - ~>
53
36
  - !ruby/object:Gem::Version
54
- version: '1.4'
55
- type: :development
37
+ version: '0.0'
38
+ type: :runtime
56
39
  prerelease: false
57
- version_requirements: *70180181144260
58
- - !ruby/object:Gem::Dependency
59
- name: cucumber
60
- requirement: &70180181143520 !ruby/object:Gem::Requirement
40
+ version_requirements: !ruby/object:Gem::Requirement
61
41
  none: false
62
42
  requirements:
63
43
  - - ~>
64
44
  - !ruby/object:Gem::Version
65
- version: '1.0'
66
- type: :development
67
- prerelease: false
68
- version_requirements: *70180181143520
45
+ version: '0.0'
69
46
  - !ruby/object:Gem::Dependency
70
- name: rspec
71
- requirement: &70180181142760 !ruby/object:Gem::Requirement
47
+ name: namae
48
+ requirement: !ruby/object:Gem::Requirement
72
49
  none: false
73
50
  requirements:
74
51
  - - ~>
75
52
  - !ruby/object:Gem::Version
76
- version: '2.6'
77
- type: :development
53
+ version: '0.7'
54
+ type: :runtime
78
55
  prerelease: false
79
- version_requirements: *70180181142760
80
- - !ruby/object:Gem::Dependency
81
- name: ZenTest
82
- requirement: &70180181142000 !ruby/object:Gem::Requirement
56
+ version_requirements: !ruby/object:Gem::Requirement
83
57
  none: false
84
58
  requirements:
85
59
  - - ~>
86
60
  - !ruby/object:Gem::Version
87
- version: '4.6'
88
- type: :development
89
- prerelease: false
90
- version_requirements: *70180181142000
61
+ version: '0.7'
91
62
  description: A sophisticated parser for academic references based on machine learning
92
63
  algorithms using conditional random fields.
93
64
  email:
@@ -107,6 +78,7 @@ files:
107
78
  - README.md
108
79
  - Rakefile
109
80
  - anystyle-parser.gemspec
81
+ - cucumber.yml
110
82
  - features/step_definitions/parser_steps.rb
111
83
  - features/support/env.rb
112
84
  - lib/anystyle/parser.rb
@@ -148,7 +120,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
148
120
  version: '0'
149
121
  segments:
150
122
  - 0
151
- hash: -3694608752673104144
123
+ hash: -4454145497019098220
152
124
  required_rubygems_version: !ruby/object:Gem::Requirement
153
125
  none: false
154
126
  requirements:
@@ -157,10 +129,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
157
129
  version: '0'
158
130
  segments:
159
131
  - 0
160
- hash: -3694608752673104144
132
+ hash: -4454145497019098220
161
133
  requirements: []
162
134
  rubyforge_project:
163
- rubygems_version: 1.8.10
135
+ rubygems_version: 1.8.24
164
136
  signing_key:
165
137
  specification_version: 3
166
138
  summary: Parser for academic references.
@@ -174,3 +146,4 @@ test_files:
174
146
  - spec/benchmark.rb
175
147
  - spec/profile.rb
176
148
  - spec/spec_helper.rb
149
+ has_rdoc: