arxivsync 0.0.9 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/arxivsync/parser.rb +46 -8
- data/lib/arxivsync/version.rb +1 -1
- data/test/parser_test.rb +14 -13
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dddf2ff2529dee6f0d6638d7f58096c9a775b9b0
|
4
|
+
data.tar.gz: 624f0ef9a81a0c3cb3990736181eaef64c845caa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 107a2b4336920bf2a134e7a4ca04091b596199f9c984f84b3a3440062dc1f02d37e5e0d244735550ebea7000a6bfd6b607fe0e423544b892a35d964153b47539
|
7
|
+
data.tar.gz: e27c824d500aa07108813cd14e1fadc6b78c308f2c2302ec1e9983392efd89ac4137d8021ccb73492b447f2bb4bfc34f3e0fa370ef20c71d806ab57e7d0c3c8b
|
data/README.md
CHANGED
data/lib/arxivsync/parser.rb
CHANGED
@@ -10,6 +10,7 @@ module ArxivSync
|
|
10
10
|
:submitter, # 'N. C. Bacalis'
|
11
11
|
:versions,
|
12
12
|
:title, # "Variational Functionals for Excited States"
|
13
|
+
:author_str, # "Naoum C. Bacalis"
|
13
14
|
:authors, # ['Naoum C. Bacalis']
|
14
15
|
:categories, # ['quant-ph'] (primary category first, then crosslists)
|
15
16
|
:abstract, # "Functionals that have local minima at the excited..."
|
@@ -48,13 +49,48 @@ module ArxivSync
|
|
48
49
|
str.gsub(/\s+/, ' ').strip
|
49
50
|
end
|
50
51
|
|
52
|
+
# Like LaTeX.decode but without the punctuation weirdness
|
53
|
+
def latex_decode(str)
|
54
|
+
string = str.dup
|
55
|
+
|
56
|
+
LaTeX::Decode::Base.normalize(string)
|
57
|
+
|
58
|
+
LaTeX::Decode::Maths.decode!(string)
|
59
|
+
|
60
|
+
LaTeX::Decode::Accents.decode!(string)
|
61
|
+
LaTeX::Decode::Diacritics.decode!(string)
|
62
|
+
#LaTeX::Decode::Punctuation.decode!(string)
|
63
|
+
LaTeX::Decode::Symbols.decode!(string)
|
64
|
+
|
65
|
+
LaTeX::Decode::Base.strip_braces(string)
|
66
|
+
|
67
|
+
LaTeX.normalize_C(string)
|
68
|
+
end
|
69
|
+
|
51
70
|
def decode(string)
|
52
71
|
str = @entities.decode(string)
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
72
|
+
|
73
|
+
# Process latex entities -- except inside equations
|
74
|
+
decoded = ""
|
75
|
+
equation = false
|
76
|
+
segment = ""
|
77
|
+
str.chars do |ch|
|
78
|
+
if ch == '$'
|
79
|
+
if !equation
|
80
|
+
decoded << latex_decode(segment)
|
81
|
+
segment = ch
|
82
|
+
else
|
83
|
+
decoded << segment + ch
|
84
|
+
segment = ""
|
85
|
+
end
|
86
|
+
|
87
|
+
equation = !equation
|
88
|
+
else
|
89
|
+
segment << ch
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
decoded << latex_decode(segment)
|
58
94
|
end
|
59
95
|
|
60
96
|
def text(str)
|
@@ -69,10 +105,12 @@ module ArxivSync
|
|
69
105
|
when :authors
|
70
106
|
# Author strings may contain strange metadata
|
71
107
|
# Non-regex parsing to handle nested parens
|
108
|
+
@model.author_str = decode(clean(str))
|
109
|
+
|
72
110
|
depth = 0
|
73
111
|
no_parens = ""
|
74
112
|
|
75
|
-
|
113
|
+
@model.author_str.chars do |ch|
|
76
114
|
case ch
|
77
115
|
when '('
|
78
116
|
depth += 1
|
@@ -83,8 +121,8 @@ module ArxivSync
|
|
83
121
|
end
|
84
122
|
end
|
85
123
|
|
86
|
-
@model.authors =
|
87
|
-
.map { |s|
|
124
|
+
@model.authors = no_parens.split(/,|:|;|\sand\s|\s?the\s/i)
|
125
|
+
.map { |s| clean(s) }
|
88
126
|
.reject { |s| s.empty? }
|
89
127
|
when :categories
|
90
128
|
@model.categories = clean(str).split(/\s/)
|
data/lib/arxivsync/version.rb
CHANGED
data/test/parser_test.rb
CHANGED
@@ -13,19 +13,20 @@ class TestParser < Minitest::Test
|
|
13
13
|
assert_equal papers.count, 1000
|
14
14
|
papers.each do |paper|
|
15
15
|
if paper.id == '0801.3673'
|
16
|
-
assert_equal
|
16
|
+
assert_equal "N. C. Bacalis", paper.submitter
|
17
17
|
|
18
|
-
assert_equal paper.versions.length
|
19
|
-
assert_equal
|
20
|
-
assert_equal paper.versions[0].size
|
18
|
+
assert_equal 1, paper.versions.length
|
19
|
+
assert_equal Time.parse("Wed, 23 Jan 2008 21:06:41 GMT"), paper.versions[0].date
|
20
|
+
assert_equal "121kb", paper.versions[0].size
|
21
21
|
|
22
|
-
assert_equal
|
22
|
+
assert_equal "Variational Functionals for Excited States", paper.title
|
23
|
+
assert_equal "Naoum C. Bacalis", paper.author_str
|
23
24
|
|
24
|
-
assert_equal
|
25
|
-
assert_equal
|
25
|
+
assert_equal ["Naoum C. Bacalis"], paper.authors
|
26
|
+
assert_equal ["quant-ph"], paper.categories
|
26
27
|
|
27
|
-
assert_equal
|
28
|
-
assert_equal
|
28
|
+
assert_equal "4 pages", paper.comments
|
29
|
+
assert_equal "Functionals that have local minima at the excited states of a non degenerate Hamiltonian are presented. Then, improved mutually orthogonal approximants of the ground and the first excited state are reported.", paper.abstract
|
29
30
|
tested += 1
|
30
31
|
end
|
31
32
|
|
@@ -39,8 +40,8 @@ class TestParser < Minitest::Test
|
|
39
40
|
assert_equal paper.versions[1].size, "58kb"
|
40
41
|
|
41
42
|
assert_equal paper.title, "Weak Localization of Dirac Fermions in Graphene"
|
42
|
-
assert_equal
|
43
|
-
assert_equal
|
43
|
+
assert_equal ["Xin-Zhong Yan", "C. S. Ting"], paper.authors
|
44
|
+
assert_equal ["cond-mat.str-el"], paper.categories
|
44
45
|
assert_equal paper.comments, "4 pages, 4 figures"
|
45
46
|
assert_equal paper.journal_ref, "PRL 101, 126801 (2008)"
|
46
47
|
assert_equal paper.doi, "10.1103/PhysRevLett.101.126801"
|
@@ -50,11 +51,11 @@ class TestParser < Minitest::Test
|
|
50
51
|
|
51
52
|
# Ensure we handle TeX special characters
|
52
53
|
if paper.id == "0801.3763"
|
53
|
-
assert_equal "Dijana
|
54
|
+
assert_equal "Dijana Žilić", paper.authors[1]
|
54
55
|
|
55
56
|
# But make sure we didn't try to parse any
|
56
57
|
# complex math-- that cannot be unicode
|
57
|
-
assert_includes paper.abstract, "[Cu(bpy)$_3$]$_2$[Cr(C$_2$O$_4$)$_3$]NO$_3
|
58
|
+
assert_includes paper.abstract, "[Cu(bpy)$_3$]$_2$[Cr(C$_2$O$_4$)$_3$]NO$_3\\cdot $9H$_2$O"
|
58
59
|
end
|
59
60
|
|
60
61
|
# Ensure we parse html entities
|