arxivsync 0.0.9 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9125d2dc0e6a12de5a4ec015f45d12801189090f
4
- data.tar.gz: 71ad86f611dc9153ee668580078840109e2d8394
3
+ metadata.gz: dddf2ff2529dee6f0d6638d7f58096c9a775b9b0
4
+ data.tar.gz: 624f0ef9a81a0c3cb3990736181eaef64c845caa
5
5
  SHA512:
6
- metadata.gz: 8fc09072898e94ce1903df950d87fc4ab2a3ef737355f014afc473ca9e28aa6f154c74e7622be6f0097fbd2284d76645c7bbbcbf2b7a0c61ecd7faa75463cbf2
7
- data.tar.gz: f19f459fa163cac5044457fc9aa88355cd6d9faecc04bbe4cb67f28b411fbef0ef55c892c47a26c8b97811fa8b9237e8194f3ea4b63f1be39c20dcfce57145c0
6
+ metadata.gz: 107a2b4336920bf2a134e7a4ca04091b596199f9c984f84b3a3440062dc1f02d37e5e0d244735550ebea7000a6bfd6b607fe0e423544b892a35d964153b47539
7
+ data.tar.gz: e27c824d500aa07108813cd14e1fadc6b78c308f2c2302ec1e9983392efd89ac4137d8021ccb73492b447f2bb4bfc34f3e0fa370ef20c71d806ab57e7d0c3c8b
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # arxivsync 0.0.9
1
+ # arxivsync 0.1.0
2
2
 
3
3
  Ruby OAI interface for harvesting the arXiv. Can be used to store and update an XML mirror of paper metadata, and parse the XML into Ruby objects to allow conversion into a friendlier format.
4
4
 
@@ -10,6 +10,7 @@ module ArxivSync
10
10
  :submitter, # 'N. C. Bacalis'
11
11
  :versions,
12
12
  :title, # "Variational Functionals for Excited States"
13
+ :author_str, # "Naoum C. Bacalis"
13
14
  :authors, # ['Naoum C. Bacalis']
14
15
  :categories, # ['quant-ph'] (primary category first, then crosslists)
15
16
  :abstract, # "Functionals that have local minima at the excited..."
@@ -48,13 +49,48 @@ module ArxivSync
48
49
  str.gsub(/\s+/, ' ').strip
49
50
  end
50
51
 
52
+ # Like LaTeX.decode but without the punctuation weirdness
53
+ def latex_decode(str)
54
+ string = str.dup
55
+
56
+ LaTeX::Decode::Base.normalize(string)
57
+
58
+ LaTeX::Decode::Maths.decode!(string)
59
+
60
+ LaTeX::Decode::Accents.decode!(string)
61
+ LaTeX::Decode::Diacritics.decode!(string)
62
+ #LaTeX::Decode::Punctuation.decode!(string)
63
+ LaTeX::Decode::Symbols.decode!(string)
64
+
65
+ LaTeX::Decode::Base.strip_braces(string)
66
+
67
+ LaTeX.normalize_C(string)
68
+ end
69
+
51
70
  def decode(string)
52
71
  str = @entities.decode(string)
53
- LaTeX::Decode::Base.normalize(str)
54
- LaTeX::Decode::Accents.decode!(str)
55
- LaTeX::Decode::Diacritics.decode!(str)
56
- LaTeX::Decode::Symbols.decode!(str)
57
- str
72
+
73
+ # Process latex entities -- except inside equations
74
+ decoded = ""
75
+ equation = false
76
+ segment = ""
77
+ str.chars do |ch|
78
+ if ch == '$'
79
+ if !equation
80
+ decoded << latex_decode(segment)
81
+ segment = ch
82
+ else
83
+ decoded << segment + ch
84
+ segment = ""
85
+ end
86
+
87
+ equation = !equation
88
+ else
89
+ segment << ch
90
+ end
91
+ end
92
+
93
+ decoded << latex_decode(segment)
58
94
  end
59
95
 
60
96
  def text(str)
@@ -69,10 +105,12 @@ module ArxivSync
69
105
  when :authors
70
106
  # Author strings may contain strange metadata
71
107
  # Non-regex parsing to handle nested parens
108
+ @model.author_str = decode(clean(str))
109
+
72
110
  depth = 0
73
111
  no_parens = ""
74
112
 
75
- str.chars do |ch|
113
+ @model.author_str.chars do |ch|
76
114
  case ch
77
115
  when '('
78
116
  depth += 1
@@ -83,8 +121,8 @@ module ArxivSync
83
121
  end
84
122
  end
85
123
 
86
- @model.authors = clean(no_parens).split(/,| and /)
87
- .map { |s| decode(clean(s)) }
124
+ @model.authors = no_parens.split(/,|:|;|\sand\s|\s?the\s/i)
125
+ .map { |s| clean(s) }
88
126
  .reject { |s| s.empty? }
89
127
  when :categories
90
128
  @model.categories = clean(str).split(/\s/)
@@ -1,3 +1,3 @@
1
1
  module ArxivSync
2
- VERSION = "0.0.9"
2
+ VERSION = "0.1.0"
3
3
  end
data/test/parser_test.rb CHANGED
@@ -13,19 +13,20 @@ class TestParser < Minitest::Test
13
13
  assert_equal papers.count, 1000
14
14
  papers.each do |paper|
15
15
  if paper.id == '0801.3673'
16
- assert_equal paper.submitter, "N. C. Bacalis"
16
+ assert_equal "N. C. Bacalis", paper.submitter
17
17
 
18
- assert_equal paper.versions.length, 1
19
- assert_equal paper.versions[0].date, Time.parse("Wed, 23 Jan 2008 21:06:41 GMT")
20
- assert_equal paper.versions[0].size, "121kb"
18
+ assert_equal 1, paper.versions.length
19
+ assert_equal Time.parse("Wed, 23 Jan 2008 21:06:41 GMT"), paper.versions[0].date
20
+ assert_equal "121kb", paper.versions[0].size
21
21
 
22
- assert_equal paper.title, "Variational Functionals for Excited States"
22
+ assert_equal "Variational Functionals for Excited States", paper.title
23
+ assert_equal "Naoum C. Bacalis", paper.author_str
23
24
 
24
- assert_equal paper.authors, ["Naoum C. Bacalis"]
25
- assert_equal paper.categories, ["quant-ph"]
25
+ assert_equal ["Naoum C. Bacalis"], paper.authors
26
+ assert_equal ["quant-ph"], paper.categories
26
27
 
27
- assert_equal paper.comments, "4 pages"
28
- assert_equal paper.abstract, "Functionals that have local minima at the excited states of a non degenerate Hamiltonian are presented. Then, improved mutually orthogonal approximants of the ground and the first excited state are reported."
28
+ assert_equal "4 pages", paper.comments
29
+ assert_equal "Functionals that have local minima at the excited states of a non degenerate Hamiltonian are presented. Then, improved mutually orthogonal approximants of the ground and the first excited state are reported.", paper.abstract
29
30
  tested += 1
30
31
  end
31
32
 
@@ -39,8 +40,8 @@ class TestParser < Minitest::Test
39
40
  assert_equal paper.versions[1].size, "58kb"
40
41
 
41
42
  assert_equal paper.title, "Weak Localization of Dirac Fermions in Graphene"
42
- assert_equal paper.authors, ["Xin-Zhong Yan", "C. S. Ting"]
43
- assert_equal paper.categories, ["cond-mat.str-el"]
43
+ assert_equal ["Xin-Zhong Yan", "C. S. Ting"], paper.authors
44
+ assert_equal ["cond-mat.str-el"], paper.categories
44
45
  assert_equal paper.comments, "4 pages, 4 figures"
45
46
  assert_equal paper.journal_ref, "PRL 101, 126801 (2008)"
46
47
  assert_equal paper.doi, "10.1103/PhysRevLett.101.126801"
@@ -50,11 +51,11 @@ class TestParser < Minitest::Test
50
51
 
51
52
  # Ensure we handle TeX special characters
52
53
  if paper.id == "0801.3763"
53
- assert_equal "Dijana Žilić", paper.authors[1]
54
+ assert_equal "Dijana Žilić", paper.authors[1]
54
55
 
55
56
  # But make sure we didn't try to parse any
56
57
  # complex math-- that cannot be unicode
57
- assert_includes paper.abstract, "[Cu(bpy)$_3$]$_2$[Cr(C$_2$O$_4$)$_3$]NO$_3⋅$9H$_2$O"
58
+ assert_includes paper.abstract, "[Cu(bpy)$_3$]$_2$[Cr(C$_2$O$_4$)$_3$]NO$_3\\cdot $9H$_2$O"
58
59
  end
59
60
 
60
61
  # Ensure we parse html entities
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arxivsync
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jaiden Mispy