arxivsync 0.0.8 → 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6f23b2225a55b6369e6c84961b6328247289573b
4
- data.tar.gz: dfc6064821a4a6d7c12434cff5ae551c4f784600
3
+ metadata.gz: 9125d2dc0e6a12de5a4ec015f45d12801189090f
4
+ data.tar.gz: 71ad86f611dc9153ee668580078840109e2d8394
5
5
  SHA512:
6
- metadata.gz: 44ccf8d910693b46719262f0778b76e9933a5d7bd521bd75dfb6c1a17205f8efc61d2549f0e7a16d62b2b4b74ff966df5b52ece62997219e4251595ad90f0054
7
- data.tar.gz: 249f809cfc44b4bf32a5fadf072bc87be91a98fb6a4fcf23f1dddd00f171402cf78fdb44e44fd23b3bcf11d49bcedbc8cd28ba872d4e89f3f1760845ec0feb68
6
+ metadata.gz: 8fc09072898e94ce1903df950d87fc4ab2a3ef737355f014afc473ca9e28aa6f154c74e7622be6f0097fbd2284d76645c7bbbcbf2b7a0c61ecd7faa75463cbf2
7
+ data.tar.gz: f19f459fa163cac5044457fc9aa88355cd6d9faecc04bbe4cb67f28b411fbef0ef55c892c47a26c8b97811fa8b9237e8194f3ea4b63f1be39c20dcfce57145c0
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # arxivsync 0.0.8
1
+ # arxivsync 0.0.9
2
2
 
3
3
  Ruby OAI interface for harvesting the arXiv. Can be used to store and update an XML mirror of paper metadata, and parse the XML into Ruby objects to allow conversion into a friendlier format.
4
4
 
data/arxivsync.gemspec CHANGED
@@ -25,7 +25,8 @@ Gem::Specification.new do |spec|
25
25
 
26
26
  spec.add_runtime_dependency "oai"
27
27
  spec.add_runtime_dependency "colorize"
28
- spec.add_runtime_dependency "latex-decode"
28
+ spec.add_runtime_dependency "htmlentities", "~> 4.3.1"
29
+ spec.add_runtime_dependency "latex-decode", "~> 0.1.1"
29
30
  spec.add_runtime_dependency "ox", ">= 2.0.2" # Super-fast XML parser
30
31
  spec.add_runtime_dependency "nokogiri" # Slower but more accurate parser
31
32
  end
@@ -27,6 +27,10 @@ module ArxivSync
27
27
  class XMLParser < ::Ox::Sax
28
28
  attr_accessor :papers
29
29
 
30
+ def initialize
31
+ @entities = HTMLEntities.new
32
+ end
33
+
30
34
  def start_element(name, attributes=[])
31
35
  @el = name
32
36
  case name
@@ -34,7 +38,7 @@ module ArxivSync
34
38
  @papers = []
35
39
  when :metadata
36
40
  @model = Paper.new
37
- @versions = []
41
+ @model.versions = []
38
42
  when :version
39
43
  @version = Version.new
40
44
  end
@@ -44,27 +48,52 @@ module ArxivSync
44
48
  str.gsub(/\s+/, ' ').strip
45
49
  end
46
50
 
51
+ def decode(string)
52
+ str = @entities.decode(string)
53
+ LaTeX::Decode::Base.normalize(str)
54
+ LaTeX::Decode::Accents.decode!(str)
55
+ LaTeX::Decode::Diacritics.decode!(str)
56
+ LaTeX::Decode::Symbols.decode!(str)
57
+ str
58
+ end
59
+
47
60
  def text(str)
48
61
  case @el
49
62
  # Necessary elements
50
63
  when :id
51
64
  @model.id = clean(str)
52
65
  when :submitter
53
- @model.submitter = clean(str)
66
+ @model.submitter = decode(clean(str))
54
67
  when :title
55
- @model.title = clean(str)
68
+ @model.title = decode(clean(str))
56
69
  when :authors
57
- regex = /(?:,| and )(?![^\(\)]*+\))/
58
- @model.authors = clean(str).split(regex)
59
- .map { |s| LaTeX.decode(clean(s)) }.reject { |s| s.empty? }
70
+ # Author strings may contain strange metadata
71
+ # Non-regex parsing to handle nested parens
72
+ depth = 0
73
+ no_parens = ""
74
+
75
+ str.chars do |ch|
76
+ case ch
77
+ when '('
78
+ depth += 1
79
+ when ')'
80
+ depth -= 1
81
+ else
82
+ no_parens << ch if depth == 0
83
+ end
84
+ end
85
+
86
+ @model.authors = clean(no_parens).split(/,| and /)
87
+ .map { |s| decode(clean(s)) }
88
+ .reject { |s| s.empty? }
60
89
  when :categories
61
90
  @model.categories = clean(str).split(/\s/)
62
91
  when :abstract
63
- @model.abstract = clean(str)
92
+ @model.abstract = decode(clean(str))
64
93
 
65
94
  # Optional elements
66
95
  when :comments
67
- @model.comments = clean(str)
96
+ @model.comments = decode(clean(str))
68
97
  when :"msc-class"
69
98
  @model.msc_class = clean(str)
70
99
  when :"report-no"
@@ -89,10 +118,8 @@ module ArxivSync
89
118
  def end_element(name)
90
119
  case name
91
120
  when :version
92
- @versions.push(@version)
121
+ @model.versions.push(@version)
93
122
  when :metadata # End of a paper entry
94
- @model.versions = @versions
95
-
96
123
  @papers.push(@model)
97
124
  end
98
125
  @el = nil
@@ -1,3 +1,3 @@
1
1
  module ArxivSync
2
- VERSION = "0.0.8"
2
+ VERSION = "0.0.9"
3
3
  end
data/lib/arxivsync.rb CHANGED
@@ -2,7 +2,9 @@ require 'oai'
2
2
  require 'nokogiri'
3
3
  require 'ox'
4
4
  require 'colorize'
5
+ require 'htmlentities'
5
6
  require 'latex/decode'
7
+
6
8
  require 'arxivsync/version'
7
9
  require 'arxivsync/parser'
8
10
  require 'arxivsync/downloader'
data/test/parser_test.rb CHANGED
@@ -48,9 +48,30 @@ class TestParser < Minitest::Test
48
48
  tested += 1
49
49
  end
50
50
 
51
- if paper.id == "0801.3713"
52
- # Ensure latex-decode works properly
53
- assert_equal "Jean-Claude Léon (LGS)", paper.authors[2]
51
+ # Ensure we handle TeX special characters
52
+ if paper.id == "0801.3763"
53
+ assert_equal "Dijana Žilić", paper.authors[1]
54
+
55
+ # But make sure we didn't try to parse any
56
+ # complex math-- that cannot be unicode
57
+ assert_includes paper.abstract, "[Cu(bpy)$_3$]$_2$[Cr(C$_2$O$_4$)$_3$]NO$_3⋅$9H$_2$O"
58
+ end
59
+
60
+ # Ensure we parse html entities
61
+ if paper.id == "0801.3778"
62
+ assert_equal "6 pages, 10 figures, to appear in \"Young massive clusters, initial conditions and environments\", typo in author's name corrected", paper.comments
63
+ elsif paper.id == "0801.3789"
64
+ assert_includes paper.abstract, "The addition of this \"conservative noise\" allows"
65
+ end
66
+
67
+ # And weird author strings
68
+ if paper.id == "0801.3898"
69
+ assert_equal ["A. Frasca", "Zs. Kovari", "K.G. Strassmeier", "K. Biazzo"], paper.authors
70
+ end
71
+
72
+ # And those pesky "and"s
73
+ if paper.id == "0801.3674"
74
+ assert_equal ["Robert H. Brandenberger", "Keshav Dasgupta", "Anne-Christine Davis"], paper.authors
54
75
  end
55
76
  end
56
77
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arxivsync
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.8
4
+ version: 0.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jaiden Mispy
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-01-27 00:00:00.000000000 Z
11
+ date: 2014-01-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -94,20 +94,34 @@ dependencies:
94
94
  - - ">="
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: htmlentities
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: 4.3.1
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: 4.3.1
97
111
  - !ruby/object:Gem::Dependency
98
112
  name: latex-decode
99
113
  requirement: !ruby/object:Gem::Requirement
100
114
  requirements:
101
- - - ">="
115
+ - - "~>"
102
116
  - !ruby/object:Gem::Version
103
- version: '0'
117
+ version: 0.1.1
104
118
  type: :runtime
105
119
  prerelease: false
106
120
  version_requirements: !ruby/object:Gem::Requirement
107
121
  requirements:
108
- - - ">="
122
+ - - "~>"
109
123
  - !ruby/object:Gem::Version
110
- version: '0'
124
+ version: 0.1.1
111
125
  - !ruby/object:Gem::Dependency
112
126
  name: ox
113
127
  requirement: !ruby/object:Gem::Requirement