arxivsync 0.0.9 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/arxivsync/parser.rb +46 -8
- data/lib/arxivsync/version.rb +1 -1
- data/test/parser_test.rb +14 -13
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dddf2ff2529dee6f0d6638d7f58096c9a775b9b0
|
4
|
+
data.tar.gz: 624f0ef9a81a0c3cb3990736181eaef64c845caa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 107a2b4336920bf2a134e7a4ca04091b596199f9c984f84b3a3440062dc1f02d37e5e0d244735550ebea7000a6bfd6b607fe0e423544b892a35d964153b47539
|
7
|
+
data.tar.gz: e27c824d500aa07108813cd14e1fadc6b78c308f2c2302ec1e9983392efd89ac4137d8021ccb73492b447f2bb4bfc34f3e0fa370ef20c71d806ab57e7d0c3c8b
|
data/README.md
CHANGED
data/lib/arxivsync/parser.rb
CHANGED
@@ -10,6 +10,7 @@ module ArxivSync
|
|
10
10
|
:submitter, # 'N. C. Bacalis'
|
11
11
|
:versions,
|
12
12
|
:title, # "Variational Functionals for Excited States"
|
13
|
+
:author_str, # "Naoum C. Bacalis"
|
13
14
|
:authors, # ['Naoum C. Bacalis']
|
14
15
|
:categories, # ['quant-ph'] (primary category first, then crosslists)
|
15
16
|
:abstract, # "Functionals that have local minima at the excited..."
|
@@ -48,13 +49,48 @@ module ArxivSync
|
|
48
49
|
str.gsub(/\s+/, ' ').strip
|
49
50
|
end
|
50
51
|
|
52
|
+
# Like LaTeX.decode but without the punctuation weirdness
|
53
|
+
def latex_decode(str)
|
54
|
+
string = str.dup
|
55
|
+
|
56
|
+
LaTeX::Decode::Base.normalize(string)
|
57
|
+
|
58
|
+
LaTeX::Decode::Maths.decode!(string)
|
59
|
+
|
60
|
+
LaTeX::Decode::Accents.decode!(string)
|
61
|
+
LaTeX::Decode::Diacritics.decode!(string)
|
62
|
+
#LaTeX::Decode::Punctuation.decode!(string)
|
63
|
+
LaTeX::Decode::Symbols.decode!(string)
|
64
|
+
|
65
|
+
LaTeX::Decode::Base.strip_braces(string)
|
66
|
+
|
67
|
+
LaTeX.normalize_C(string)
|
68
|
+
end
|
69
|
+
|
51
70
|
def decode(string)
|
52
71
|
str = @entities.decode(string)
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
72
|
+
|
73
|
+
# Process latex entities -- except inside equations
|
74
|
+
decoded = ""
|
75
|
+
equation = false
|
76
|
+
segment = ""
|
77
|
+
str.chars do |ch|
|
78
|
+
if ch == '$'
|
79
|
+
if !equation
|
80
|
+
decoded << latex_decode(segment)
|
81
|
+
segment = ch
|
82
|
+
else
|
83
|
+
decoded << segment + ch
|
84
|
+
segment = ""
|
85
|
+
end
|
86
|
+
|
87
|
+
equation = !equation
|
88
|
+
else
|
89
|
+
segment << ch
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
decoded << latex_decode(segment)
|
58
94
|
end
|
59
95
|
|
60
96
|
def text(str)
|
@@ -69,10 +105,12 @@ module ArxivSync
|
|
69
105
|
when :authors
|
70
106
|
# Author strings may contain strange metadata
|
71
107
|
# Non-regex parsing to handle nested parens
|
108
|
+
@model.author_str = decode(clean(str))
|
109
|
+
|
72
110
|
depth = 0
|
73
111
|
no_parens = ""
|
74
112
|
|
75
|
-
|
113
|
+
@model.author_str.chars do |ch|
|
76
114
|
case ch
|
77
115
|
when '('
|
78
116
|
depth += 1
|
@@ -83,8 +121,8 @@ module ArxivSync
|
|
83
121
|
end
|
84
122
|
end
|
85
123
|
|
86
|
-
@model.authors =
|
87
|
-
.map { |s|
|
124
|
+
@model.authors = no_parens.split(/,|:|;|\sand\s|\s?the\s/i)
|
125
|
+
.map { |s| clean(s) }
|
88
126
|
.reject { |s| s.empty? }
|
89
127
|
when :categories
|
90
128
|
@model.categories = clean(str).split(/\s/)
|
data/lib/arxivsync/version.rb
CHANGED
data/test/parser_test.rb
CHANGED
@@ -13,19 +13,20 @@ class TestParser < Minitest::Test
|
|
13
13
|
assert_equal papers.count, 1000
|
14
14
|
papers.each do |paper|
|
15
15
|
if paper.id == '0801.3673'
|
16
|
-
assert_equal
|
16
|
+
assert_equal "N. C. Bacalis", paper.submitter
|
17
17
|
|
18
|
-
assert_equal paper.versions.length
|
19
|
-
assert_equal
|
20
|
-
assert_equal paper.versions[0].size
|
18
|
+
assert_equal 1, paper.versions.length
|
19
|
+
assert_equal Time.parse("Wed, 23 Jan 2008 21:06:41 GMT"), paper.versions[0].date
|
20
|
+
assert_equal "121kb", paper.versions[0].size
|
21
21
|
|
22
|
-
assert_equal
|
22
|
+
assert_equal "Variational Functionals for Excited States", paper.title
|
23
|
+
assert_equal "Naoum C. Bacalis", paper.author_str
|
23
24
|
|
24
|
-
assert_equal
|
25
|
-
assert_equal
|
25
|
+
assert_equal ["Naoum C. Bacalis"], paper.authors
|
26
|
+
assert_equal ["quant-ph"], paper.categories
|
26
27
|
|
27
|
-
assert_equal
|
28
|
-
assert_equal
|
28
|
+
assert_equal "4 pages", paper.comments
|
29
|
+
assert_equal "Functionals that have local minima at the excited states of a non degenerate Hamiltonian are presented. Then, improved mutually orthogonal approximants of the ground and the first excited state are reported.", paper.abstract
|
29
30
|
tested += 1
|
30
31
|
end
|
31
32
|
|
@@ -39,8 +40,8 @@ class TestParser < Minitest::Test
|
|
39
40
|
assert_equal paper.versions[1].size, "58kb"
|
40
41
|
|
41
42
|
assert_equal paper.title, "Weak Localization of Dirac Fermions in Graphene"
|
42
|
-
assert_equal
|
43
|
-
assert_equal
|
43
|
+
assert_equal ["Xin-Zhong Yan", "C. S. Ting"], paper.authors
|
44
|
+
assert_equal ["cond-mat.str-el"], paper.categories
|
44
45
|
assert_equal paper.comments, "4 pages, 4 figures"
|
45
46
|
assert_equal paper.journal_ref, "PRL 101, 126801 (2008)"
|
46
47
|
assert_equal paper.doi, "10.1103/PhysRevLett.101.126801"
|
@@ -50,11 +51,11 @@ class TestParser < Minitest::Test
|
|
50
51
|
|
51
52
|
# Ensure we handle TeX special characters
|
52
53
|
if paper.id == "0801.3763"
|
53
|
-
assert_equal "Dijana
|
54
|
+
assert_equal "Dijana Žilić", paper.authors[1]
|
54
55
|
|
55
56
|
# But make sure we didn't try to parse any
|
56
57
|
# complex math-- that cannot be unicode
|
57
|
-
assert_includes paper.abstract, "[Cu(bpy)$_3$]$_2$[Cr(C$_2$O$_4$)$_3$]NO$_3
|
58
|
+
assert_includes paper.abstract, "[Cu(bpy)$_3$]$_2$[Cr(C$_2$O$_4$)$_3$]NO$_3\\cdot $9H$_2$O"
|
58
59
|
end
|
59
60
|
|
60
61
|
# Ensure we parse html entities
|