arxiv-references 0.1.7.0 → 0.1.7.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/arxiv/references/{Arxiv.rb → ArxivApi.rb} +8 -3
- data/lib/arxiv/references/ArxivReferences.rb +1 -1
- data/lib/arxiv/references/ArxivUtil.rb +23 -0
- data/lib/arxiv/references/{myUtil.rb → P3.rb} +10 -41
- data/lib/arxiv/references/version.rb +1 -1
- data/lib/arxiv/references.rb +0 -1
- metadata +4 -4
- data/lib/arxiv/references/FetchPaperPDF.rb +0 -80
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 41b13e1de94b5b60ded925f2621f87e881262b5b
|
4
|
+
data.tar.gz: 70882f9a8f74f0c8c549f23bd9c284fc0b24458e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8439088f8eb532a5b9c0f092db5e0c918e2c49929669c6cbe8b8592b32ac88e7b63e515acac7c7cef099f7b51c09e6b7f959dd6247d39b54308805a38f572265
|
7
|
+
data.tar.gz: 9434188bcd8afa713bdc2384bb6c77af5f00ef6c9a06aeabf6f6915e48c35272f5e4f89a8a6159772f052a6487f94642eeb833913817eaec761ee29b3d9119ac
|
@@ -2,14 +2,14 @@ require 'open-uri'
|
|
2
2
|
require 'nokogiri'
|
3
3
|
require 'json'
|
4
4
|
|
5
|
-
class Arxiv
|
5
|
+
class ArxivApi
|
6
6
|
attr_reader :title, :authors, :abstruct, :pdfurl
|
7
7
|
attr_accessor :references
|
8
8
|
BASE_URL = 'https://arxiv.org'
|
9
9
|
def initialize(id)
|
10
|
-
|
10
|
+
id = "#{BASE_URL}/abs/#{id}" if id.index('http').nil?
|
11
11
|
charset = nil
|
12
|
-
html = open(
|
12
|
+
html = open(id) do |f|
|
13
13
|
charset = f.charset
|
14
14
|
f.read
|
15
15
|
end
|
@@ -36,6 +36,11 @@ class Arxiv
|
|
36
36
|
def fetch_pdfurl
|
37
37
|
"#{BASE_URL}#{@page.xpath('//*[@id="abs"]/div[1]/div[1]/ul/li[1]/a').attr('href').value}"
|
38
38
|
end
|
39
|
+
|
40
|
+
def to_json
|
41
|
+
JSON.pretty_generate({title: @title, authors: @authors, abstruct: @abstruct, pdfurl: @pdfurl, references: @references})
|
42
|
+
end
|
43
|
+
|
39
44
|
end
|
40
45
|
|
41
46
|
|
@@ -0,0 +1,23 @@
|
|
1
|
+
|
2
|
+
$:.unshift Pathname.new(__FILE__).dirname.join().expand_path.to_s
|
3
|
+
require 'ArxivApi'
|
4
|
+
require 'P3'
|
5
|
+
|
6
|
+
|
7
|
+
module ArxivUtil
|
8
|
+
BASE_URL = "https://arxiv.org"
|
9
|
+
def self.fetchFromUrl(urlName, work_dir, use_dir, use_pdf)
|
10
|
+
arxiv = ArxivApi.new(urlName)
|
11
|
+
arxiv.references = P3.fetchFromPdfUrl(arxiv.pdfurl, work_dir, use_dir) if use_pdf || use_pdf.nil?
|
12
|
+
return arxiv
|
13
|
+
end
|
14
|
+
|
15
|
+
def self.fetchFromArxivId(id, work_dir, use_dir, use_pdf)
|
16
|
+
target_url = "#{BASE_URL}/abs/#{id}"
|
17
|
+
fetchFromUrl(target_url, work_dir, use_dir, use_pdf)
|
18
|
+
end
|
19
|
+
|
20
|
+
def self.fetchFromPdfUrl(pdfUrl, work_dir, use_dir)
|
21
|
+
return P3.fetchFromPdfUrl(pdfUrl, work_dir, use_dir)
|
22
|
+
end
|
23
|
+
end
|
@@ -1,16 +1,13 @@
|
|
1
|
-
require 'open-uri'
|
2
1
|
require 'digest/sha2'
|
3
2
|
require 'time'
|
4
3
|
require 'fileutils'
|
5
4
|
require 'pty'
|
6
5
|
require 'expect'
|
7
6
|
require 'pdf-reader'
|
8
|
-
require 'nokogiri'
|
9
|
-
require 'json'
|
10
7
|
|
11
|
-
|
8
|
+
class P3
|
12
9
|
BASE_URL = "https://arxiv.org"
|
13
|
-
REFERENCE_START_REGEXP = Regexp.new('
|
10
|
+
REFERENCE_START_REGEXP = Regexp.new('[rR][eE][fF][eE][rR][eE][nN][cC][eE][sS]*')
|
14
11
|
REFERENCE_REGEXP = Regexp.new('(\[[0-9]?[0-9]\]|\[.+?\])')
|
15
12
|
def self.makeId
|
16
13
|
return Digest::SHA256.hexdigest Time.now.strftime("%F %H:%M:%S")
|
@@ -44,31 +41,6 @@ module ArxivUtil
|
|
44
41
|
File.delete("#{work_dir}/#{id}-output.pdf")
|
45
42
|
File.delete("#{work_dir}/#{id}-output_k2opt.pdf")
|
46
43
|
end
|
47
|
-
|
48
|
-
|
49
|
-
def self.fetchFromUrl(urlName, work_dir, use_dir, use_pdf)
|
50
|
-
puts "fetch => #{urlName}"
|
51
|
-
charset = nil
|
52
|
-
html = open(urlName) do |f|
|
53
|
-
charset = f.charset
|
54
|
-
f.read
|
55
|
-
end
|
56
|
-
|
57
|
-
page = Nokogiri::HTML.parse(html, nil, charset)
|
58
|
-
result = {}
|
59
|
-
result[:title] = page.xpath('//*[@id="abs"]/div[2]/h1').children.select{|i| i.name=='text'}.shift.text.gsub(/\n/,'')
|
60
|
-
result[:authors] = page.xpath('//*[@id="abs"]/div[2]/div[2]/a').map(&:text)
|
61
|
-
result[:abstruct] = page.xpath('//*[@id="abs"]/div[2]/blockquote').children.select{|i| i.name = 'text'}.reverse.shift.text
|
62
|
-
result[:pdfurl] = "#{BASE_URL}#{page.xpath('//*[@id="abs"]/div[1]/div[1]/ul/li[1]/a').attr('href').value}"
|
63
|
-
result[:references] = fetchFromPdfUrl(result[:pdfurl], work_dir, use_dir) if use_pdf
|
64
|
-
return result
|
65
|
-
end
|
66
|
-
|
67
|
-
def self.fetchFromArxivId(id, work_dir, use_dir, use_pdf)
|
68
|
-
target_url = "#{BASE_URL}/abs/#{id}"
|
69
|
-
fetchFromUrl(target_url, work_dir, use_dir, use_pdf)
|
70
|
-
end
|
71
|
-
|
72
44
|
def self.fetchPdfFile(pdfUrl,file_name)
|
73
45
|
open(file_name, 'wb') do |o|
|
74
46
|
open(pdfUrl) do |data|
|
@@ -94,7 +66,6 @@ module ArxivUtil
|
|
94
66
|
return getK2Pdf(job_id, work_dir, use_dir)
|
95
67
|
end
|
96
68
|
|
97
|
-
|
98
69
|
def self.fetchReference(file_name)
|
99
70
|
reader = PDF::Reader.new(file_name)
|
100
71
|
page_no = reader.
|
@@ -105,27 +76,25 @@ module ArxivUtil
|
|
105
76
|
map(&:number).
|
106
77
|
sort.
|
107
78
|
shift
|
108
|
-
|
79
|
+
|
109
80
|
ref_page = reader.
|
110
81
|
pages.
|
111
82
|
select{|i|
|
112
83
|
i.number >= page_no
|
113
84
|
}.
|
114
85
|
map{|i|
|
115
|
-
i.text.gsub(/\n+/,"\n").gsub(/ +/,' ')
|
86
|
+
i.text.gsub(/\n\n+/,"\n").gsub(/ +/,' ').gsub(/-\n +/,'')
|
116
87
|
}.
|
117
88
|
join(' ').
|
89
|
+
split("\n").
|
90
|
+
join(' ').
|
118
91
|
gsub(REFERENCE_REGEXP,"\n\\1").
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
return ref_page[(ref_page.index{|i| i =~ REFERENCE_START_REGEXP}+1)..ref_page.length].
|
123
|
-
select{|i|
|
124
|
-
i.length > 5
|
125
|
-
}
|
92
|
+
split("\n").
|
93
|
+
select{|i| i.length > 15}
|
94
|
+
return ref_page
|
126
95
|
end
|
127
96
|
|
128
|
-
def self.fetchFromPdfUrl(pdfUrl, work_dir, use_dir)
|
97
|
+
def self.fetchFromPdfUrl(pdfUrl, work_dir=true, use_dir=true)
|
129
98
|
job_id = makeId
|
130
99
|
makeDir(job_id, work_dir) if use_dir
|
131
100
|
file_name = makeFile(job_id, work_dir, use_dir)
|
data/lib/arxiv/references.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: arxiv-references
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.7.0
|
4
|
+
version: 0.1.7.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Takahiro Nishimura
|
@@ -115,10 +115,10 @@ files:
|
|
115
115
|
- bin/setup
|
116
116
|
- exe/arxiv-ref
|
117
117
|
- lib/arxiv/references.rb
|
118
|
-
- lib/arxiv/references/Arxiv.rb
|
118
|
+
- lib/arxiv/references/ArxivApi.rb
|
119
119
|
- lib/arxiv/references/ArxivReferences.rb
|
120
|
-
- lib/arxiv/references/FetchPaperPDF.rb
|
121
|
-
- lib/arxiv/references/myUtil.rb
|
120
|
+
- lib/arxiv/references/ArxivUtil.rb
|
121
|
+
- lib/arxiv/references/P3.rb
|
122
122
|
- lib/arxiv/references/version.rb
|
123
123
|
homepage: https://github.com/nishimuuu/Arxiv-references
|
124
124
|
licenses:
|
@@ -1,80 +0,0 @@
|
|
1
|
-
require 'digest/sha2'
|
2
|
-
require 'time'
|
3
|
-
require 'fileutils'
|
4
|
-
require 'pty'
|
5
|
-
require 'expect'
|
6
|
-
require 'pdf-reader'
|
7
|
-
|
8
|
-
class P3
|
9
|
-
def self.fetchPdfFile(pdfUrl,file_name)
|
10
|
-
open(file_name, 'wb') do |o|
|
11
|
-
open(pdfUrl) do |data|
|
12
|
-
o.write(data.read)
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
|
-
def self.convertSingleColPdf(job_id, work_dir,file_name, use_dir)
|
18
|
-
cmd = "k2pdfopt -dev kpw #{file_name}"
|
19
|
-
PTY.spawn(cmd) do |i,o|
|
20
|
-
o.sync = true
|
21
|
-
i.expect(/\S.*Enter option above \(h=help, q=quit\):/,10){
|
22
|
-
o.puts "\n"
|
23
|
-
o.flush
|
24
|
-
}
|
25
|
-
while( i.eof? == false )
|
26
|
-
res = i.gets
|
27
|
-
print res
|
28
|
-
break unless res.index('written').nil?
|
29
|
-
end
|
30
|
-
end
|
31
|
-
return getK2Pdf(job_id, work_dir, use_dir)
|
32
|
-
end
|
33
|
-
|
34
|
-
def self.fetchReference(file_name)
|
35
|
-
reader = PDF::Reader.new(file_name)
|
36
|
-
page_no = reader.
|
37
|
-
pages.
|
38
|
-
reject{|i|
|
39
|
-
i.text.index(REFERENCE_START_REGEXP).nil?
|
40
|
-
}.
|
41
|
-
map(&:number).
|
42
|
-
sort.
|
43
|
-
shift
|
44
|
-
puts "Detect References page=> #{page_no} "
|
45
|
-
ref_page = reader.
|
46
|
-
pages.
|
47
|
-
select{|i|
|
48
|
-
i.number >= page_no
|
49
|
-
}.
|
50
|
-
map{|i|
|
51
|
-
i.text.gsub(/\n+/,"\n").gsub(/ +/,' ')
|
52
|
-
}.
|
53
|
-
join(' ').
|
54
|
-
gsub(REFERENCE_REGEXP,"\n\\1").
|
55
|
-
gsub('- ','').
|
56
|
-
split("\n")
|
57
|
-
|
58
|
-
return ref_page[(ref_page.index{|i| i =~ REFERENCE_START_REGEXP}+1)..ref_page.length].
|
59
|
-
select{|i|
|
60
|
-
i.length > 5
|
61
|
-
}
|
62
|
-
end
|
63
|
-
|
64
|
-
def self.fetchFromPdfUrl(pdfUrl, work_dir, use_dir)
|
65
|
-
job_id = makeId
|
66
|
-
makeDir(job_id, work_dir) if use_dir
|
67
|
-
file_name = makeFile(job_id, work_dir, use_dir)
|
68
|
-
|
69
|
-
fetchPdfFile(pdfUrl, file_name)
|
70
|
-
executed_pdf = convertSingleColPdf(job_id, work_dir, file_name, use_dir)
|
71
|
-
references = fetchReference(executed_pdf)
|
72
|
-
if use_dir
|
73
|
-
removeDir(job_id, work_dir)
|
74
|
-
else
|
75
|
-
removeFile(job_id, work_dir)
|
76
|
-
end
|
77
|
-
return references
|
78
|
-
end
|
79
|
-
|
80
|
-
end
|