arxivsync 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/arxivsync.gemspec +2 -1
- data/lib/arxivsync/parser.rb +38 -11
- data/lib/arxivsync/version.rb +1 -1
- data/lib/arxivsync.rb +2 -0
- data/test/parser_test.rb +24 -3
- metadata +20 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9125d2dc0e6a12de5a4ec015f45d12801189090f
|
4
|
+
data.tar.gz: 71ad86f611dc9153ee668580078840109e2d8394
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8fc09072898e94ce1903df950d87fc4ab2a3ef737355f014afc473ca9e28aa6f154c74e7622be6f0097fbd2284d76645c7bbbcbf2b7a0c61ecd7faa75463cbf2
|
7
|
+
data.tar.gz: f19f459fa163cac5044457fc9aa88355cd6d9faecc04bbe4cb67f28b411fbef0ef55c892c47a26c8b97811fa8b9237e8194f3ea4b63f1be39c20dcfce57145c0
|
data/README.md
CHANGED
data/arxivsync.gemspec
CHANGED
@@ -25,7 +25,8 @@ Gem::Specification.new do |spec|
|
|
25
25
|
|
26
26
|
spec.add_runtime_dependency "oai"
|
27
27
|
spec.add_runtime_dependency "colorize"
|
28
|
-
spec.add_runtime_dependency "latex-decode"
|
28
|
+
spec.add_runtime_dependency "htmlentities", "~> 4.3.1"
|
29
|
+
spec.add_runtime_dependency "latex-decode", "~> 0.1.1"
|
29
30
|
spec.add_runtime_dependency "ox", ">= 2.0.2" # Super-fast XML parser
|
30
31
|
spec.add_runtime_dependency "nokogiri" # Slower but more accurate parser
|
31
32
|
end
|
data/lib/arxivsync/parser.rb
CHANGED
@@ -27,6 +27,10 @@ module ArxivSync
|
|
27
27
|
class XMLParser < ::Ox::Sax
|
28
28
|
attr_accessor :papers
|
29
29
|
|
30
|
+
def initialize
|
31
|
+
@entities = HTMLEntities.new
|
32
|
+
end
|
33
|
+
|
30
34
|
def start_element(name, attributes=[])
|
31
35
|
@el = name
|
32
36
|
case name
|
@@ -34,7 +38,7 @@ module ArxivSync
|
|
34
38
|
@papers = []
|
35
39
|
when :metadata
|
36
40
|
@model = Paper.new
|
37
|
-
@versions = []
|
41
|
+
@model.versions = []
|
38
42
|
when :version
|
39
43
|
@version = Version.new
|
40
44
|
end
|
@@ -44,27 +48,52 @@ module ArxivSync
|
|
44
48
|
str.gsub(/\s+/, ' ').strip
|
45
49
|
end
|
46
50
|
|
51
|
+
def decode(string)
|
52
|
+
str = @entities.decode(string)
|
53
|
+
LaTeX::Decode::Base.normalize(str)
|
54
|
+
LaTeX::Decode::Accents.decode!(str)
|
55
|
+
LaTeX::Decode::Diacritics.decode!(str)
|
56
|
+
LaTeX::Decode::Symbols.decode!(str)
|
57
|
+
str
|
58
|
+
end
|
59
|
+
|
47
60
|
def text(str)
|
48
61
|
case @el
|
49
62
|
# Necessary elements
|
50
63
|
when :id
|
51
64
|
@model.id = clean(str)
|
52
65
|
when :submitter
|
53
|
-
@model.submitter = clean(str)
|
66
|
+
@model.submitter = decode(clean(str))
|
54
67
|
when :title
|
55
|
-
@model.title = clean(str)
|
68
|
+
@model.title = decode(clean(str))
|
56
69
|
when :authors
|
57
|
-
|
58
|
-
|
59
|
-
|
70
|
+
# Author strings may contain strange metadata
|
71
|
+
# Non-regex parsing to handle nested parens
|
72
|
+
depth = 0
|
73
|
+
no_parens = ""
|
74
|
+
|
75
|
+
str.chars do |ch|
|
76
|
+
case ch
|
77
|
+
when '('
|
78
|
+
depth += 1
|
79
|
+
when ')'
|
80
|
+
depth -= 1
|
81
|
+
else
|
82
|
+
no_parens << ch if depth == 0
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
@model.authors = clean(no_parens).split(/,| and /)
|
87
|
+
.map { |s| decode(clean(s)) }
|
88
|
+
.reject { |s| s.empty? }
|
60
89
|
when :categories
|
61
90
|
@model.categories = clean(str).split(/\s/)
|
62
91
|
when :abstract
|
63
|
-
@model.abstract = clean(str)
|
92
|
+
@model.abstract = decode(clean(str))
|
64
93
|
|
65
94
|
# Optional elements
|
66
95
|
when :comments
|
67
|
-
@model.comments = clean(str)
|
96
|
+
@model.comments = decode(clean(str))
|
68
97
|
when :"msc-class"
|
69
98
|
@model.msc_class = clean(str)
|
70
99
|
when :"report-no"
|
@@ -89,10 +118,8 @@ module ArxivSync
|
|
89
118
|
def end_element(name)
|
90
119
|
case name
|
91
120
|
when :version
|
92
|
-
@versions.push(@version)
|
121
|
+
@model.versions.push(@version)
|
93
122
|
when :metadata # End of a paper entry
|
94
|
-
@model.versions = @versions
|
95
|
-
|
96
123
|
@papers.push(@model)
|
97
124
|
end
|
98
125
|
@el = nil
|
data/lib/arxivsync/version.rb
CHANGED
data/lib/arxivsync.rb
CHANGED
data/test/parser_test.rb
CHANGED
@@ -48,9 +48,30 @@ class TestParser < Minitest::Test
|
|
48
48
|
tested += 1
|
49
49
|
end
|
50
50
|
|
51
|
-
|
52
|
-
|
53
|
-
assert_equal "
|
51
|
+
# Ensure we handle TeX special characters
|
52
|
+
if paper.id == "0801.3763"
|
53
|
+
assert_equal "Dijana Žilić", paper.authors[1]
|
54
|
+
|
55
|
+
# But make sure we didn't try to parse any
|
56
|
+
# complex math-- that cannot be unicode
|
57
|
+
assert_includes paper.abstract, "[Cu(bpy)$_3$]$_2$[Cr(C$_2$O$_4$)$_3$]NO$_3⋅$9H$_2$O"
|
58
|
+
end
|
59
|
+
|
60
|
+
# Ensure we parse html entities
|
61
|
+
if paper.id == "0801.3778"
|
62
|
+
assert_equal "6 pages, 10 figures, to appear in \"Young massive clusters, initial conditions and environments\", typo in author's name corrected", paper.comments
|
63
|
+
elsif paper.id == "0801.3789"
|
64
|
+
assert_includes paper.abstract, "The addition of this \"conservative noise\" allows"
|
65
|
+
end
|
66
|
+
|
67
|
+
# And weird author strings
|
68
|
+
if paper.id == "0801.3898"
|
69
|
+
assert_equal ["A. Frasca", "Zs. Kovari", "K.G. Strassmeier", "K. Biazzo"], paper.authors
|
70
|
+
end
|
71
|
+
|
72
|
+
# And those pesky "and"s
|
73
|
+
if paper.id == "0801.3674"
|
74
|
+
assert_equal ["Robert H. Brandenberger", "Keshav Dasgupta", "Anne-Christine Davis"], paper.authors
|
54
75
|
end
|
55
76
|
end
|
56
77
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: arxivsync
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.8
|
4
|
+
version: 0.0.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaiden Mispy
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-01-
|
11
|
+
date: 2014-01-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -94,20 +94,34 @@ dependencies:
|
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: htmlentities
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: 4.3.1
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: 4.3.1
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: latex-decode
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
100
114
|
requirements:
|
101
|
-
- - ">="
|
115
|
+
- - "~>"
|
102
116
|
- !ruby/object:Gem::Version
|
103
|
-
version: '0'
|
117
|
+
version: 0.1.1
|
104
118
|
type: :runtime
|
105
119
|
prerelease: false
|
106
120
|
version_requirements: !ruby/object:Gem::Requirement
|
107
121
|
requirements:
|
108
|
-
- - ">="
|
122
|
+
- - "~>"
|
109
123
|
- !ruby/object:Gem::Version
|
110
|
-
version: '0'
|
124
|
+
version: 0.1.1
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
126
|
name: ox
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|