anystyle-parser 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +18 -2
- data/Rakefile +18 -0
- data/lib/anystyle/parser/normalizer.rb +5 -3
- data/lib/anystyle/parser/version.rb +1 -1
- data/spec/anystyle/parser/normalizer_spec.rb +20 -0
- metadata +17 -16
data/README.md
CHANGED
@@ -29,7 +29,7 @@ recommended to install Kyoto Cabinet and the `kyotocabinet-ruby` gem.
|
|
29
29
|
The database file will be created the first time you access the dictionary;
|
30
30
|
note that you will need write permissions in the directory where the file
|
31
31
|
is to be created. You can change the Dictionary's default path in the
|
32
|
-
|
32
|
+
Dictionary's options:
|
33
33
|
|
34
34
|
Anystyle::Parser::Dictionary.instance.options[:path]
|
35
35
|
|
@@ -72,7 +72,17 @@ The following irb sessions illustrates some parser goodness:
|
|
72
72
|
=> "Jorge"
|
73
73
|
> b[0].author.to_s
|
74
74
|
=> "Liu, Dong C. and Nocedal, Jorge"
|
75
|
-
|
75
|
+
> puts Anystyle.parse('Auster, Paul. The Art of Hunger. Expanded. New York: Penguin, 1997.', :bibtex).to_s
|
76
|
+
@book{2162008820,
|
77
|
+
author = {Auster, Paul},
|
78
|
+
title = {The Art of Hunger},
|
79
|
+
location = {New York},
|
80
|
+
publisher = {Penguin},
|
81
|
+
edition = {Expanded},
|
82
|
+
year = {1997}
|
83
|
+
}
|
84
|
+
=> nil
|
85
|
+
|
76
86
|
### Unhappy with the results?
|
77
87
|
|
78
88
|
Citation references come in many forms, so, inevitably, you will find data
|
@@ -117,6 +127,8 @@ data again:
|
|
117
127
|
> Anystyle.parse 'John Lafferty, Andrew McCallum, and Fernando Pereira. 2001. Conditional random fields: probabilistic models for segmenting and labeling sequence data. In Proceedings of the International Conference on Machine Learning, pages 282-289. Morgan Kaufmann, San Francisco, CA.'
|
118
128
|
=> [{:author=>"John Lafferty and Andrew McCallum and Fernando Pereira", :title=>"Conditional random fields: probabilistic models for segmenting and labeling sequence data", :booktitle=>"Proceedings of the International Conference on Machine Learning", :pages=>"282--289", :publisher=>"Morgan Kaufmann", :location=>"San Francisco, CA", :year=>2001, :type=>:inproceedings}]
|
119
129
|
|
130
|
+
If you want to make Anystyle-Parser smarter, please consider sending us your
|
131
|
+
tagged references (see below).
|
120
132
|
|
121
133
|
Contributing
|
122
134
|
------------
|
@@ -132,6 +144,10 @@ If you've found a bug or have a question, please open an issue on the
|
|
132
144
|
Or, for extra credit, clone the Anystyle-Parser repository, write a failing
|
133
145
|
example, fix the bug and submit a pull request.
|
134
146
|
|
147
|
+
If you want to contribute tagged references, please either add them to
|
148
|
+
`resources/train.txt` or create a new file in the `resources` directory
|
149
|
+
and open a pull request on GitHub.
|
150
|
+
|
135
151
|
|
136
152
|
License
|
137
153
|
-------
|
data/Rakefile
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
lib = File.expand_path('../lib/', __FILE__)
|
2
|
+
$:.unshift lib unless $:.include?(lib)
|
3
|
+
|
4
|
+
require 'rake/clean'
|
5
|
+
|
6
|
+
require 'anystyle/parser/version'
|
7
|
+
|
8
|
+
task :build => [:clean] do
|
9
|
+
system 'gem build anystyle-parser.gemspec'
|
10
|
+
end
|
11
|
+
|
12
|
+
task :release => [:build] do
|
13
|
+
system "git tag #{Anystyle::Parser::VERSION}"
|
14
|
+
system "gem push anystyle-parser-#{Anystyle::Parser::VERSION}.gem"
|
15
|
+
end
|
16
|
+
|
17
|
+
CLEAN.include('*.gem')
|
18
|
+
CLEAN.include('*.rbc')
|
data/lib/anystyle/parser/normalizer.rb
CHANGED
@@ -136,7 +136,7 @@ module Anystyle
|
|
136
136
|
s, n, ns, cc = StringScanner.new(names), '', [], 0
|
137
137
|
until s.eos?
|
138
138
|
case
|
139
|
-
when s.scan(/,?\s*and\b/)
|
139
|
+
when s.scan(/,?\s*(and\b|&)/)
|
140
140
|
ns << n
|
141
141
|
n, cc = '', 0
|
142
142
|
when s.scan(/\s+/)
|
@@ -144,14 +144,16 @@ module Anystyle
|
|
144
144
|
when s.scan(/,?\s*(jr|sr|ph\.?d|m\.?d|esq)\.?/i)
|
145
145
|
n << s.matched
|
146
146
|
when s.scan(/,/)
|
147
|
-
if cc > 0 || n =~ /\
|
147
|
+
if cc > 0 || (n =~ /\S{2,}\s+\S{2,}/ && s.rest !~ /^\s*\w+(\.|,|$)/)
|
148
148
|
ns << n
|
149
149
|
n, cc = '', 0
|
150
150
|
else
|
151
151
|
n << s.matched
|
152
152
|
cc += 1
|
153
153
|
end
|
154
|
-
when s.scan(/\w+/)
|
154
|
+
when s.scan(/\w+/)
|
155
|
+
n << s.matched
|
156
|
+
when s.scan(/./)
|
155
157
|
n << s.matched
|
156
158
|
end
|
157
159
|
end
|
data/spec/anystyle/parser/normalizer_spec.rb
CHANGED
@@ -29,6 +29,26 @@ module Anystyle
|
|
29
29
|
Normalizer.instance.tokenize_names('A, B, C').should == ['A, B', ' C']
|
30
30
|
end
|
31
31
|
|
32
|
+
it "tokenizes 'Aa Bb, C.'" do
|
33
|
+
Normalizer.instance.tokenize_names('Aa Bb, C.').should == ['Aa Bb, C.']
|
34
|
+
end
|
35
|
+
|
36
|
+
it "tokenizes 'Aa Bb, Cc Dd, and E F G'" do
|
37
|
+
Normalizer.instance.tokenize_names('Aa Bb, C D, and E F G').should == ['Aa Bb', ' C D', ' E F G']
|
38
|
+
end
|
39
|
+
|
40
|
+
[
|
41
|
+
['Poe, Edgar A.', ['Poe, Edgar A.']],
|
42
|
+
['Edgar A. Poe', ['Edgar A. Poe']],
|
43
|
+
['Edgar A. Poe, Herman Melville', ['Edgar A. Poe', ' Herman Melville']],
|
44
|
+
['Poe, Edgar A., Melville, Herman', ['Poe, Edgar A.', ' Melville, Herman']],
|
45
|
+
['Aeschlimann Magnin, E.', ['Aeschlimann Magnin, E.']]
|
46
|
+
].each do |name, tokens|
|
47
|
+
it "tokenizes #{name.inspect}" do
|
48
|
+
Normalizer.instance.tokenize_names(name).should == tokens
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
32
52
|
end
|
33
53
|
end
|
34
54
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anystyle-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-09-
|
12
|
+
date: 2011-09-06 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bibtex-ruby
|
16
|
-
requirement: &
|
16
|
+
requirement: &2153531180 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '1.3'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2153531180
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: wapiti
|
27
|
-
requirement: &
|
27
|
+
requirement: &2153529340 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0.0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2153529340
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: rake
|
38
|
-
requirement: &
|
38
|
+
requirement: &2153528780 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0.9'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2153528780
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: racc
|
49
|
-
requirement: &
|
49
|
+
requirement: &2153528020 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '1.4'
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *2153528020
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: cucumber
|
60
|
-
requirement: &
|
60
|
+
requirement: &2153526200 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '1.0'
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *2153526200
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rspec
|
71
|
-
requirement: &
|
71
|
+
requirement: &2153524800 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ~>
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '2.6'
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *2153524800
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: ZenTest
|
82
|
-
requirement: &
|
82
|
+
requirement: &2153523460 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ~>
|
@@ -87,7 +87,7 @@ dependencies:
|
|
87
87
|
version: '4.6'
|
88
88
|
type: :development
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *2153523460
|
91
91
|
description: A sophisticated parser for academic references based on conditional random
|
92
92
|
fields.
|
93
93
|
email:
|
@@ -105,6 +105,7 @@ files:
|
|
105
105
|
- HISTORY.md
|
106
106
|
- LICENSE
|
107
107
|
- README.md
|
108
|
+
- Rakefile
|
108
109
|
- anystyle-parser.gemspec
|
109
110
|
- features/step_definitions/parser_steps.rb
|
110
111
|
- features/support/env.rb
|