anystyle-parser 0.1.1 → 0.2.0

This diff compares the contents of two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
data/Gemfile CHANGED
@@ -1,14 +1,19 @@
1
1
  source :rubygems
2
2
  gemspec
3
3
 
4
- group :debug do
5
- gem 'ruby-debug19', :require => 'ruby-debug', :platforms => [:mri_19]
6
- gem 'ruby-debug', :platforms => [:mri_18, :jruby]
7
- gem 'rbx-trepanning', :platforms => [:rbx]
4
+ group :development do
5
+ gem 'debugger', :platforms => [:mri_19]
6
+ gem 'simplecov'
7
+ gem 'yard'
8
8
  end
9
9
 
10
- group :osx_test do
11
- gem 'autotest-fsevent', :require => false
10
+ group :test do
11
+ gem 'rake'
12
+ gem 'racc', '~>1.4'
13
+
14
+ gem 'cucumber'
15
+ gem 'rspec'
16
+ gem 'ZenTest'
12
17
  end
13
18
 
14
19
  group :profile do
@@ -16,6 +21,7 @@ group :profile do
16
21
  gem 'gnuplot'
17
22
  end
18
23
 
19
- group :kyotocabinet do
24
+ group :extra do
20
25
  gem 'kyotocabinet-ruby', :require => 'kyotocabinet'
26
+ gem 'autotest-fsevent', :require => false
21
27
  end
data/HISTORY.md CHANGED
@@ -1,3 +1,7 @@
1
+ 0.2.0 / 2012-10-29
2
+ ==================
3
+ * Use Namae for name normalization
4
+
1
5
  0.1.1 / 2012-03-29
2
6
  ==================
3
7
  * Bugfix
data/README.md CHANGED
@@ -5,7 +5,7 @@ Anystyle-Parser is a very fast and smart parser for academic references. It
5
5
  is inspired by [ParsCit](http://aye.comp.nus.edu.sg/parsCit/) and
6
6
  [FreeCite](http://freecite.library.brown.edu/); Anystyle-Parser uses machine
7
7
  learning algorithms and is designed
8
- for raw speed (it [wapiti](https://github.com/inukshuk/wapiti-ruby) based
8
+ for raw speed (it uses [wapiti](https://github.com/inukshuk/wapiti-ruby) based
9
9
  conditional random fields and [Kyoto Cabinet](http://fallabs.com/kyotocabinet/)
10
10
  or [Redis](http://redis.io) as a key-value store), flexibility (it is easy to
11
11
  train the model with data that is relevant to your parsing needs), and
data/Rakefile CHANGED
@@ -1,10 +1,19 @@
1
- lib = File.expand_path('../lib/', __FILE__)
2
- $:.unshift lib unless $:.include?(lib)
3
-
1
+ require 'bundler'
2
+ begin
3
+ Bundler.setup(:default, :development, :debug, :test, :extra, :profile)
4
+ rescue Bundler::BundlerError => e
5
+ $stderr.puts e.message
6
+ $stderr.puts "Run `bundle install` to install missing gems"
7
+ exit e.status_code
8
+ end
9
+ require 'rake'
4
10
  require 'rake/clean'
5
11
 
12
+ $:.unshift(File.join(File.dirname(__FILE__), './lib'))
13
+
6
14
  require 'anystyle/parser/version'
7
15
 
16
+ task :default
8
17
  task :build => [:clean] do
9
18
  system 'gem build anystyle-parser.gemspec'
10
19
  end
@@ -14,5 +23,34 @@ task :release => [:build] do
14
23
  system "gem push anystyle-parser-#{Anystyle::Parser::VERSION}.gem"
15
24
  end
16
25
 
26
+ require 'rspec/core'
27
+ require 'rspec/core/rake_task'
28
+ RSpec::Core::RakeTask.new(:spec) do |spec|
29
+ spec.pattern = FileList['spec/**/*_spec.rb']
30
+ end
31
+
32
+ require 'cucumber/rake/task'
33
+ Cucumber::Rake::Task.new(:features)
34
+
35
+ task :default => :spec
36
+
37
+ begin
38
+ require 'yard'
39
+ YARD::Rake::YardocTask.new
40
+ rescue LoadError
41
+ # ignore
42
+ end
43
+
44
+ desc 'Run an IRB session with CSL loaded'
45
+ task :console, [:script] do |t, args|
46
+ ARGV.clear
47
+
48
+ require 'irb'
49
+ require 'anystyle/parser'
50
+
51
+ IRB.conf[:SCRIPT] = args.script
52
+ IRB.start
53
+ end
54
+
17
55
  CLEAN.include('*.gem')
18
56
  CLEAN.include('*.rbc')
@@ -17,12 +17,7 @@ Gem::Specification.new do |s|
17
17
 
18
18
  s.add_runtime_dependency('bibtex-ruby', '~>2.0')
19
19
  s.add_runtime_dependency('wapiti', '~>0.0')
20
-
21
- s.add_development_dependency('rake', ['~>0.9'])
22
- s.add_development_dependency('racc', ['~>1.4'])
23
- s.add_development_dependency('cucumber', ['~>1.0'])
24
- s.add_development_dependency('rspec', ['~>2.6'])
25
- s.add_development_dependency('ZenTest', ['~>4.6'])
20
+ s.add_runtime_dependency('namae', '~>0.7')
26
21
 
27
22
  s.files = `git ls-files`.split("\n") - Dir['resources/**/*']
28
23
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
data/cucumber.yml ADDED
@@ -0,0 +1 @@
1
+ default: --format progress --require features --color
@@ -129,47 +129,15 @@ module Anystyle
129
129
  hash
130
130
  end
131
131
 
132
+ Namae::Parser.instance.options[:prefer_comma_as_separator] = true
133
+
132
134
  def normalize_names(names)
133
- names = tokenize_names(names).map do |name|
134
- name.strip!
135
- name.gsub!(/\b([[:upper:]]{2,3})\b/) { $1.split(//).join(' ') }
136
- name.gsub!(/\b([[:upper:]])(\s|$)/) { [$1, $2 == ?. ? nil : ?., $2].compact.join }
137
- name
138
- end
139
- names.join(' and ')
135
+ Namae.parse!(names).map(&:sort_order).join(' and ')
140
136
  rescue => e
141
137
  warn e.message
142
138
  hash
143
139
  end
144
-
145
- def tokenize_names(names)
146
- s, n, ns, cc = StringScanner.new(names), '', [], 0
147
- until s.eos?
148
- case
149
- when s.scan(/,?\s*(and\b|&|;)/)
150
- ns << n
151
- n, cc = '', 0
152
- when s.scan(/\s+/)
153
- n << ' '
154
- when s.scan(/,?\s*(jr|sr|ph\.?d|m\.?d|esq)\.?/i)
155
- n << s.matched
156
- when s.scan(/,/)
157
- if cc > 0 || (n =~ /\S{2,}\s+\S{2,}/ && s.rest !~ /^\s*[[:alpha:]]+(\.|,|$)/)
158
- ns << n
159
- n, cc = '', 0
160
- else
161
- n << s.matched
162
- cc += 1
163
- end
164
- when s.scan(/[[:alpha:]]+/)
165
- n << s.matched
166
- when s.scan(/./)
167
- n << s.matched
168
- end
169
- end
170
- ns << n
171
- end
172
-
140
+
173
141
  def normalize_title(hash)
174
142
  title, container = hash[:title]
175
143
 
@@ -1,5 +1,5 @@
1
1
  module Anystyle
2
2
  module Parser
3
- VERSION = '0.1.1'.freeze
3
+ VERSION = '0.2.0'.freeze
4
4
  end
5
5
  end
@@ -3,9 +3,7 @@ require 'singleton'
3
3
 
4
4
  require 'bibtex'
5
5
  require 'wapiti'
6
-
7
- # require 'ruby-debug'
8
- # Debugger.start
6
+ require 'namae'
9
7
 
10
8
  require 'anystyle/parser/errors'
11
9
 
@@ -6,46 +6,46 @@ module Anystyle
6
6
  describe "#tokenize_names" do
7
7
 
8
8
  it "tokenizes 'A B'" do
9
- Normalizer.instance.tokenize_names('A B').should == ['A B']
9
+ Normalizer.instance.normalize_names('A B').should == 'B, A'
10
10
  end
11
11
 
12
12
  it "tokenizes 'A, B'" do
13
- Normalizer.instance.tokenize_names('A, B').should == ['A, B']
13
+ Normalizer.instance.normalize_names('A, B').should == 'A, B'
14
14
  end
15
15
 
16
- it "tokenizes 'A, jr., B'" do
17
- Normalizer.instance.tokenize_names('A, jr., B').should == ['A, jr., B']
18
- end
19
-
20
- it "tokenizes 'A, B, jr.'" do
21
- Normalizer.instance.tokenize_names('A, B, jr.').should == ['A, B, jr.']
22
- end
16
+ # it "tokenizes 'A, jr., B'" do
17
+ # Normalizer.instance.normalize_names('A, jr., B').should == 'A, jr., B'
18
+ # end
19
+ #
20
+ # it "tokenizes 'A, B, jr.'" do
21
+ # Normalizer.instance.normalize_names('A, B, jr.').should == 'A, B, jr.'
22
+ # end
23
23
 
24
24
  it "tokenizes 'A, B, C, D'" do
25
- Normalizer.instance.tokenize_names('A, B, C, D').should == ['A, B', ' C, D']
25
+ Normalizer.instance.normalize_names('A, B, C, D').should == 'A, B and C, D'
26
26
  end
27
27
 
28
28
  it "tokenizes 'A, B, C'" do
29
- Normalizer.instance.tokenize_names('A, B, C').should == ['A, B', ' C']
29
+ Normalizer.instance.normalize_names('A, B, C').should == 'A, B and C'
30
30
  end
31
31
 
32
32
  it "tokenizes 'Aa Bb, C.'" do
33
- Normalizer.instance.tokenize_names('Aa Bb, C.').should == ['Aa Bb, C.']
33
+ Normalizer.instance.normalize_names('Aa Bb, C.').should == 'Aa Bb, C.'
34
34
  end
35
35
 
36
36
  it "tokenizes 'Aa Bb, Cc Dd, and E F G'" do
37
- Normalizer.instance.tokenize_names('Aa Bb, C D, and E F G').should == ['Aa Bb', ' C D', ' E F G']
37
+ Normalizer.instance.normalize_names('Aa Bb, C D, and E F G').should == 'Bb, Aa and D, C and G, E F'
38
38
  end
39
39
 
40
40
  [
41
- ['Poe, Edgar A.', ['Poe, Edgar A.']],
42
- ['Edgar A. Poe', ['Edgar A. Poe']],
43
- ['Edgar A. Poe, Herman Melville', ['Edgar A. Poe', ' Herman Melville']],
44
- ['Poe, Edgar A., Melville, Herman', ['Poe, Edgar A.', ' Melville, Herman']],
45
- ['Aeschlimann Magnin, E.', ['Aeschlimann Magnin, E.']]
46
- ].each do |name, tokens|
41
+ ['Poe, Edgar A.', 'Poe, Edgar A.'],
42
+ ['Edgar A. Poe', 'Poe, Edgar A.'],
43
+ ['Edgar A. Poe, Herman Melville', 'Poe, Edgar A. and Melville, Herman'],
44
+ ['Poe, Edgar A., Melville, Herman', 'Poe, Edgar A. and Melville, Herman'],
45
+ ['Aeschlimann Magnin, E.', 'Aeschlimann Magnin, E.']
46
+ ].each do |name, normalized|
47
47
  it "tokenizes #{name.inspect}" do
48
- Normalizer.instance.tokenize_names(name).should == tokens
48
+ Normalizer.instance.normalize_names(name).should == normalized
49
49
  end
50
50
  end
51
51
 
data/spec/spec_helper.rb CHANGED
@@ -1 +1,24 @@
1
+ begin
2
+ require 'simplecov'
3
+ rescue LoadError
4
+ # ignore
5
+ end
6
+
7
+ begin
8
+ require 'debugger'
9
+ rescue LoadError
10
+ # ignore
11
+ end
12
+
13
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
14
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
15
+ require 'rspec'
1
16
  require 'anystyle/parser'
17
+
18
+ # Requires supporting files with custom matchers and macros, etc,
19
+ # in ./support/ and its subdirectories.
20
+ Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
21
+
22
+ RSpec.configure do |config|
23
+
24
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anystyle-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-03-29 00:00:00.000000000 Z
12
+ date: 2012-10-29 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bibtex-ruby
16
- requirement: &70180181117220 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,73 +21,44 @@ dependencies:
21
21
  version: '2.0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70180181117220
25
- - !ruby/object:Gem::Dependency
26
- name: wapiti
27
- requirement: &70180181145980 !ruby/object:Gem::Requirement
24
+ version_requirements: !ruby/object:Gem::Requirement
28
25
  none: false
29
26
  requirements:
30
27
  - - ~>
31
28
  - !ruby/object:Gem::Version
32
- version: '0.0'
33
- type: :runtime
34
- prerelease: false
35
- version_requirements: *70180181145980
36
- - !ruby/object:Gem::Dependency
37
- name: rake
38
- requirement: &70180181145180 !ruby/object:Gem::Requirement
39
- none: false
40
- requirements:
41
- - - ~>
42
- - !ruby/object:Gem::Version
43
- version: '0.9'
44
- type: :development
45
- prerelease: false
46
- version_requirements: *70180181145180
29
+ version: '2.0'
47
30
  - !ruby/object:Gem::Dependency
48
- name: racc
49
- requirement: &70180181144260 !ruby/object:Gem::Requirement
31
+ name: wapiti
32
+ requirement: !ruby/object:Gem::Requirement
50
33
  none: false
51
34
  requirements:
52
35
  - - ~>
53
36
  - !ruby/object:Gem::Version
54
- version: '1.4'
55
- type: :development
37
+ version: '0.0'
38
+ type: :runtime
56
39
  prerelease: false
57
- version_requirements: *70180181144260
58
- - !ruby/object:Gem::Dependency
59
- name: cucumber
60
- requirement: &70180181143520 !ruby/object:Gem::Requirement
40
+ version_requirements: !ruby/object:Gem::Requirement
61
41
  none: false
62
42
  requirements:
63
43
  - - ~>
64
44
  - !ruby/object:Gem::Version
65
- version: '1.0'
66
- type: :development
67
- prerelease: false
68
- version_requirements: *70180181143520
45
+ version: '0.0'
69
46
  - !ruby/object:Gem::Dependency
70
- name: rspec
71
- requirement: &70180181142760 !ruby/object:Gem::Requirement
47
+ name: namae
48
+ requirement: !ruby/object:Gem::Requirement
72
49
  none: false
73
50
  requirements:
74
51
  - - ~>
75
52
  - !ruby/object:Gem::Version
76
- version: '2.6'
77
- type: :development
53
+ version: '0.7'
54
+ type: :runtime
78
55
  prerelease: false
79
- version_requirements: *70180181142760
80
- - !ruby/object:Gem::Dependency
81
- name: ZenTest
82
- requirement: &70180181142000 !ruby/object:Gem::Requirement
56
+ version_requirements: !ruby/object:Gem::Requirement
83
57
  none: false
84
58
  requirements:
85
59
  - - ~>
86
60
  - !ruby/object:Gem::Version
87
- version: '4.6'
88
- type: :development
89
- prerelease: false
90
- version_requirements: *70180181142000
61
+ version: '0.7'
91
62
  description: A sophisticated parser for academic references based on machine learning
92
63
  algorithms using conditional random fields.
93
64
  email:
@@ -107,6 +78,7 @@ files:
107
78
  - README.md
108
79
  - Rakefile
109
80
  - anystyle-parser.gemspec
81
+ - cucumber.yml
110
82
  - features/step_definitions/parser_steps.rb
111
83
  - features/support/env.rb
112
84
  - lib/anystyle/parser.rb
@@ -148,7 +120,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
148
120
  version: '0'
149
121
  segments:
150
122
  - 0
151
- hash: -3694608752673104144
123
+ hash: -4454145497019098220
152
124
  required_rubygems_version: !ruby/object:Gem::Requirement
153
125
  none: false
154
126
  requirements:
@@ -157,10 +129,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
157
129
  version: '0'
158
130
  segments:
159
131
  - 0
160
- hash: -3694608752673104144
132
+ hash: -4454145497019098220
161
133
  requirements: []
162
134
  rubyforge_project:
163
- rubygems_version: 1.8.10
135
+ rubygems_version: 1.8.24
164
136
  signing_key:
165
137
  specification_version: 3
166
138
  summary: Parser for academic references.
@@ -174,3 +146,4 @@ test_files:
174
146
  - spec/benchmark.rb
175
147
  - spec/profile.rb
176
148
  - spec/spec_helper.rb
149
+ has_rdoc: