arxivsync 0.0.8 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/arxivsync.gemspec +2 -1
- data/lib/arxivsync/parser.rb +38 -11
- data/lib/arxivsync/version.rb +1 -1
- data/lib/arxivsync.rb +2 -0
- data/test/parser_test.rb +24 -3
- metadata +20 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9125d2dc0e6a12de5a4ec015f45d12801189090f
|
4
|
+
data.tar.gz: 71ad86f611dc9153ee668580078840109e2d8394
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8fc09072898e94ce1903df950d87fc4ab2a3ef737355f014afc473ca9e28aa6f154c74e7622be6f0097fbd2284d76645c7bbbcbf2b7a0c61ecd7faa75463cbf2
|
7
|
+
data.tar.gz: f19f459fa163cac5044457fc9aa88355cd6d9faecc04bbe4cb67f28b411fbef0ef55c892c47a26c8b97811fa8b9237e8194f3ea4b63f1be39c20dcfce57145c0
|
data/README.md
CHANGED
data/arxivsync.gemspec
CHANGED
@@ -25,7 +25,8 @@ Gem::Specification.new do |spec|
|
|
25
25
|
|
26
26
|
spec.add_runtime_dependency "oai"
|
27
27
|
spec.add_runtime_dependency "colorize"
|
28
|
-
spec.add_runtime_dependency "latex-decode"
|
28
|
+
spec.add_runtime_dependency "htmlentities", "~> 4.3.1"
|
29
|
+
spec.add_runtime_dependency "latex-decode", "~> 0.1.1"
|
29
30
|
spec.add_runtime_dependency "ox", ">= 2.0.2" # Super-fast XML parser
|
30
31
|
spec.add_runtime_dependency "nokogiri" # Slower but more accurate parser
|
31
32
|
end
|
data/lib/arxivsync/parser.rb
CHANGED
@@ -27,6 +27,10 @@ module ArxivSync
|
|
27
27
|
class XMLParser < ::Ox::Sax
|
28
28
|
attr_accessor :papers
|
29
29
|
|
30
|
+
def initialize
|
31
|
+
@entities = HTMLEntities.new
|
32
|
+
end
|
33
|
+
|
30
34
|
def start_element(name, attributes=[])
|
31
35
|
@el = name
|
32
36
|
case name
|
@@ -34,7 +38,7 @@ module ArxivSync
|
|
34
38
|
@papers = []
|
35
39
|
when :metadata
|
36
40
|
@model = Paper.new
|
37
|
-
@versions = []
|
41
|
+
@model.versions = []
|
38
42
|
when :version
|
39
43
|
@version = Version.new
|
40
44
|
end
|
@@ -44,27 +48,52 @@ module ArxivSync
|
|
44
48
|
str.gsub(/\s+/, ' ').strip
|
45
49
|
end
|
46
50
|
|
51
|
+
def decode(string)
|
52
|
+
str = @entities.decode(string)
|
53
|
+
LaTeX::Decode::Base.normalize(str)
|
54
|
+
LaTeX::Decode::Accents.decode!(str)
|
55
|
+
LaTeX::Decode::Diacritics.decode!(str)
|
56
|
+
LaTeX::Decode::Symbols.decode!(str)
|
57
|
+
str
|
58
|
+
end
|
59
|
+
|
47
60
|
def text(str)
|
48
61
|
case @el
|
49
62
|
# Necessary elements
|
50
63
|
when :id
|
51
64
|
@model.id = clean(str)
|
52
65
|
when :submitter
|
53
|
-
@model.submitter = clean(str)
|
66
|
+
@model.submitter = decode(clean(str))
|
54
67
|
when :title
|
55
|
-
@model.title = clean(str)
|
68
|
+
@model.title = decode(clean(str))
|
56
69
|
when :authors
|
57
|
-
|
58
|
-
|
59
|
-
|
70
|
+
# Author strings may contain strange metadata
|
71
|
+
# Non-regex parsing to handle nested parens
|
72
|
+
depth = 0
|
73
|
+
no_parens = ""
|
74
|
+
|
75
|
+
str.chars do |ch|
|
76
|
+
case ch
|
77
|
+
when '('
|
78
|
+
depth += 1
|
79
|
+
when ')'
|
80
|
+
depth -= 1
|
81
|
+
else
|
82
|
+
no_parens << ch if depth == 0
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
@model.authors = clean(no_parens).split(/,| and /)
|
87
|
+
.map { |s| decode(clean(s)) }
|
88
|
+
.reject { |s| s.empty? }
|
60
89
|
when :categories
|
61
90
|
@model.categories = clean(str).split(/\s/)
|
62
91
|
when :abstract
|
63
|
-
@model.abstract = clean(str)
|
92
|
+
@model.abstract = decode(clean(str))
|
64
93
|
|
65
94
|
# Optional elements
|
66
95
|
when :comments
|
67
|
-
@model.comments = clean(str)
|
96
|
+
@model.comments = decode(clean(str))
|
68
97
|
when :"msc-class"
|
69
98
|
@model.msc_class = clean(str)
|
70
99
|
when :"report-no"
|
@@ -89,10 +118,8 @@ module ArxivSync
|
|
89
118
|
def end_element(name)
|
90
119
|
case name
|
91
120
|
when :version
|
92
|
-
@versions.push(@version)
|
121
|
+
@model.versions.push(@version)
|
93
122
|
when :metadata # End of a paper entry
|
94
|
-
@model.versions = @versions
|
95
|
-
|
96
123
|
@papers.push(@model)
|
97
124
|
end
|
98
125
|
@el = nil
|
data/lib/arxivsync/version.rb
CHANGED
data/lib/arxivsync.rb
CHANGED
data/test/parser_test.rb
CHANGED
@@ -48,9 +48,30 @@ class TestParser < Minitest::Test
|
|
48
48
|
tested += 1
|
49
49
|
end
|
50
50
|
|
51
|
-
|
52
|
-
|
53
|
-
assert_equal "
|
51
|
+
# Ensure we handle TeX special characters
|
52
|
+
if paper.id == "0801.3763"
|
53
|
+
assert_equal "Dijana Žilić", paper.authors[1]
|
54
|
+
|
55
|
+
# But make sure we didn't try to parse any
|
56
|
+
# complex math-- that cannot be unicode
|
57
|
+
assert_includes paper.abstract, "[Cu(bpy)$_3$]$_2$[Cr(C$_2$O$_4$)$_3$]NO$_3⋅$9H$_2$O"
|
58
|
+
end
|
59
|
+
|
60
|
+
# Ensure we parse html entities
|
61
|
+
if paper.id == "0801.3778"
|
62
|
+
assert_equal "6 pages, 10 figures, to appear in \"Young massive clusters, initial conditions and environments\", typo in author's name corrected", paper.comments
|
63
|
+
elsif paper.id == "0801.3789"
|
64
|
+
assert_includes paper.abstract, "The addition of this \"conservative noise\" allows"
|
65
|
+
end
|
66
|
+
|
67
|
+
# And weird author strings
|
68
|
+
if paper.id == "0801.3898"
|
69
|
+
assert_equal ["A. Frasca", "Zs. Kovari", "K.G. Strassmeier", "K. Biazzo"], paper.authors
|
70
|
+
end
|
71
|
+
|
72
|
+
# And those pesky "and"s
|
73
|
+
if paper.id == "0801.3674"
|
74
|
+
assert_equal ["Robert H. Brandenberger", "Keshav Dasgupta", "Anne-Christine Davis"], paper.authors
|
54
75
|
end
|
55
76
|
end
|
56
77
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: arxivsync
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.8
|
4
|
+
version: 0.0.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaiden Mispy
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-01-
|
11
|
+
date: 2014-01-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -94,20 +94,34 @@ dependencies:
|
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: htmlentities
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: 4.3.1
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: 4.3.1
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: latex-decode
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
100
114
|
requirements:
|
101
|
-
- - ">="
|
115
|
+
- - "~>"
|
102
116
|
- !ruby/object:Gem::Version
|
103
|
-
version: '0'
|
117
|
+
version: 0.1.1
|
104
118
|
type: :runtime
|
105
119
|
prerelease: false
|
106
120
|
version_requirements: !ruby/object:Gem::Requirement
|
107
121
|
requirements:
|
108
|
-
- - ">="
|
122
|
+
- - "~>"
|
109
123
|
- !ruby/object:Gem::Version
|
110
|
-
version: '0'
|
124
|
+
version: 0.1.1
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
126
|
name: ox
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|