arxiv-references 0.1.6.5 → 0.1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +8 -2
- data/lib/arxiv/references/Arxiv.rb +41 -0
- data/lib/arxiv/references/ArxivReferences.rb +9 -15
- data/lib/arxiv/references/FetchPaperPDF.rb +80 -0
- data/lib/arxiv/references/myUtil.rb +9 -8
- data/lib/arxiv/references/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c1fd3a9552e15b293ec4f57b89a2393e840f69d9
|
4
|
+
data.tar.gz: 2cee2a9b1dc53d287c44c79cfd6f18e50d109118
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d6e24ff8fb896d0658fd7526ec62f61078a377f2b3df8ad200c4bd25cab97b0bba34ca6337991b5806eb048967e7e8ca3d17030020adc46ee8873f03612f963f
|
7
|
+
data.tar.gz: 46c78134b1b051e495162d4ce659e0ea4f7134991eaf824cd2c0b96c2875c4aa0da4ce64db1dbfb87b2db81e97846b7afc3c797f571d54dfd3a7326c66c90c73
|
data/README.md
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
[](https://travis-ci.org/nishimuuu/Arxiv-references)
|
1
2
|
[](https://badge.fury.io/rb/arxiv-references)
|
2
3
|
[](https://img.shields.io/badge/LICENSE-GPL-blue.svg)
|
3
4
|
[](https://codeclimate.com/github/nishimuuu/Arxiv-references)
|
@@ -6,6 +7,11 @@
|
|
6
7
|
|
7
8
|
Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/arxiv/references`. To experiment with that code, run `bin/console` for an interactive prompt.
|
8
9
|
|
10
|
+
## Demo
|
11
|
+
[URL](http://153.126.133.121/arxiv-references-api/html/index.html)
|
12
|
+
|
13
|
+
[API Document](http://153.126.133.121/arxiv-references-api/html/api.html)
|
14
|
+
|
9
15
|
## Dependencies
|
10
16
|
|
11
17
|
- k2pdfopt (http://www.willus.com/k2pdfopt/)
|
@@ -46,8 +52,8 @@ Or install it yourself as:
|
|
46
52
|
|
47
53
|
### Options
|
48
54
|
--work_dir : [default: /tmp] working directory to convert multi column pdf to one column
|
49
|
-
--
|
50
|
-
|
55
|
+
--dir : [default: true] create working directory or not
|
56
|
+
--pdf : [default: true] fetch the citations list from the paper's PDF; if you don't need the citations list, add option `--no-pdf`
|
51
57
|
|
52
58
|
## Development
|
53
59
|
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
# Scraped metadata for a single arXiv paper.
#
# Instantiating the class downloads the paper's abstract page, parses it with
# Nokogiri and exposes the title, author list, abstract and PDF URL through
# readers. +references+ starts out as nil and is writable so that a caller can
# attach a reference list afterwards.
class Arxiv
  attr_reader :title, :authors, :abstruct, :pdfurl
  attr_accessor :references

  BASE_URL = 'https://arxiv.org'

  # @param id [String] a bare arXiv identifier, or a full http(s) URL of the
  #   abstract page
  def initialize(id)
    charset = nil
    html = open_remote(abs_url(id)) do |f|
      charset = f.charset
      f.read
    end
    @page = Nokogiri::HTML.parse(html, nil, charset)
    @title = fetch_title
    @authors = fetch_authors
    @abstruct = fetch_abstruct
    @pdfurl = fetch_pdfurl
    @references = nil
  end

  # @return [String] text of the first bare text node under the title heading,
  #   with newlines removed
  def fetch_title
    @page.xpath('//*[@id="abs"]/div[2]/h1').children.select { |node| node.name == 'text' }.first.text.delete("\n")
  end

  # @return [Array<String>] text of every author link
  def fetch_authors
    @page.xpath('//*[@id="abs"]/div[2]/div[2]/a').map(&:text)
  end

  # @return [String] text of the last bare text node inside the abstract
  #   blockquote
  def fetch_abstruct
    # Compare with ==; a single = would rename every child node to "text" and
    # make the filter accept all of them.
    @page.xpath('//*[@id="abs"]/div[2]/blockquote').children.select { |node| node.name == 'text' }.last.text
  end

  # @return [String] BASE_URL joined with the href of the first download link
  def fetch_pdfurl
    "#{BASE_URL}#{@page.xpath('//*[@id="abs"]/div[1]/div[1]/ul/li[1]/a').attr('href').value}"
  end

  private

  # Returns the page to download: +id+ itself when it already is an http(s)
  # URL, otherwise the abstract page for that identifier under BASE_URL.
  def abs_url(id)
    id.start_with?('http://', 'https://') ? id : "#{BASE_URL}/abs/#{id}"
  end

  # Opens +url+ through open-uri and yields the response to the block.
  # URI.open is preferred when available because Kernel#open no longer
  # accepts URLs on Ruby 3.0 and later.
  def open_remote(url, &block)
    return URI.open(url, &block) if URI.respond_to?(:open)

    open(url, &block)
  end
end
|
40
|
+
|
41
|
+
|
@@ -5,36 +5,30 @@ require 'pathname'
|
|
5
5
|
lib = Pathname.new(__FILE__).dirname.join().expand_path
|
6
6
|
$:.unshift lib.to_s
|
7
7
|
require 'myUtil'
|
8
|
+
require 'json'
|
8
9
|
|
9
10
|
|
10
11
|
module ArxivReferences
  # Thor command-line front end. Each command hands its argument and the
  # shared options to ArxivUtil and prints the result as JSON.
  class CLI < Thor
    include ArxivUtil

    # Options shared by every command. The commands below read them back
    # under exactly these names (work_dir, dir, pdf).
    class_option 'work_dir', type: :string, aliases: 'Working dir', desc: 'Set working dir(default: /tmp)', default: '/tmp'
    class_option 'dir', type: :boolean, aliases: 'Working in dir', desc: 'work to make dir or not(default: true)', default: true
    class_option 'pdf', type: :boolean, aliases: 'Parse PDF', desc: 'fetch pdf information(defaut: true)', default: true

    desc 'url', 'Extract references from arxiv URL'
    # @param urlName [String] URL of the arXiv abstract page
    def url(urlName)
      # The declared option names are 'dir' and 'pdf'; looking up :use_dir or
      # :no_pdf always yields nil, which disables both features.
      puts ArxivUtil.fetchFromUrl(urlName, options[:work_dir], options[:dir], options[:pdf]).to_json
    end

    desc 'id', 'Extract references from Arxiv id'
    # @param idName [String] arXiv identifier
    def arxivid(idName)
      puts ArxivUtil.fetchFromArxivId(idName, options[:work_dir], options[:dir], options[:pdf]).to_json
    end

    desc 'pdfurl', 'Extract references from pdf URL'
    # @param pdfUrlName [String] URL of the PDF file
    def pdfurl(pdfUrlName)
      puts ArxivUtil.fetchFromPdfUrl(pdfUrlName, options[:work_dir], options[:dir]).to_json
    end
  end
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
require 'digest/sha2'
|
2
|
+
require 'time'
|
3
|
+
require 'fileutils'
|
4
|
+
require 'pty'
|
5
|
+
require 'expect'
|
6
|
+
require 'pdf-reader'
|
7
|
+
|
8
|
+
# Downloads a paper's PDF, converts it with k2pdfopt and extracts the lines
# that follow the references heading.
#
# NOTE(review): makeId, makeDir, makeFile, getK2Pdf, removeDir, removeFile,
# REFERENCE_START_REGEXP and REFERENCE_REGEXP are used below but are not
# defined in this class; presumably they are meant to come from ArxivUtil.
# Confirm they are reachable from here.
class P3
  # fetchPdfFile opens an HTTP URL, which needs open-uri; it is not among the
  # requires at the top of this file.
  require 'open-uri'

  # Saves the resource at +pdfUrl+ to +file_name+ in binary mode.
  def self.fetchPdfFile(pdfUrl, file_name)
    File.open(file_name, 'wb') do |out|
      save = proc { |data| out.write(data.read) }
      # Kernel#open no longer accepts URLs on Ruby 3.0 and later; use
      # URI.open when it exists.
      if URI.respond_to?(:open)
        URI.open(pdfUrl, &save)
      else
        open(pdfUrl, &save)
      end
    end
  end

  # Runs k2pdfopt on +file_name+ inside a pty, confirms its interactive
  # prompt with an empty line, echoes its output until a line containing
  # "written" appears, and returns whatever getK2Pdf reports for the job.
  def self.convertSingleColPdf(job_id, work_dir, file_name, use_dir)
    # Pass an argument list rather than one command string so that no shell
    # interprets spaces or metacharacters in file_name.
    PTY.spawn('k2pdfopt', '-dev', 'kpw', file_name) do |reader, writer|
      writer.sync = true
      reader.expect(/\S.*Enter option above \(h=help, q=quit\):/, 10) do
        writer.puts "\n"
        writer.flush
      end
      begin
        until reader.eof?
          line = reader.gets
          break if line.nil?

          print line
          break if line.include?('written')
        end
      rescue Errno::EIO
        # Reading a pty after the child has exited raises EIO on Linux;
        # treat it as end of output.
      end
    end
    getK2Pdf(job_id, work_dir, use_dir)
  end

  # Extracts the reference entries from the PDF at +file_name+.
  #
  # Finds the lowest-numbered page whose text matches
  # REFERENCE_START_REGEXP, joins the text of that page and all later pages,
  # splits it into lines in front of each REFERENCE_REGEXP match and returns
  # the lines after the heading that are longer than five characters.
  #
  # @return [Array<String>] reference lines; empty when no heading is found
  def self.fetchReference(file_name)
    reader = PDF::Reader.new(file_name)
    page_no = reader.pages.select { |page| page.text =~ REFERENCE_START_REGEXP }.map(&:number).min
    puts "Detect References page=> #{page_no} "
    # Without a references page there is nothing to compare page numbers
    # against; report an empty list instead of failing on nil.
    return [] if page_no.nil?

    lines = reader.pages.
      select { |page| page.number >= page_no }.
      map { |page| page.text.gsub(/\n+/, "\n").gsub(/ +/, ' ') }.
      join(' ').
      gsub(REFERENCE_REGEXP, "\n\\1").
      gsub('- ', '').
      split("\n")

    heading = lines.index { |line| line =~ REFERENCE_START_REGEXP }
    return [] if heading.nil?

    lines[(heading + 1)..-1].select { |line| line.length > 5 }
  end

  # Downloads +pdfUrl+ into a job-specific file (inside a job directory when
  # +use_dir+ is truthy), converts it, extracts the references, removes the
  # job directory or file, and returns the references.
  def self.fetchFromPdfUrl(pdfUrl, work_dir, use_dir)
    job_id = makeId
    makeDir(job_id, work_dir) if use_dir
    file_name = makeFile(job_id, work_dir, use_dir)

    fetchPdfFile(pdfUrl, file_name)
    executed_pdf = convertSingleColPdf(job_id, work_dir, file_name, use_dir)
    references = fetchReference(executed_pdf)
    if use_dir
      removeDir(job_id, work_dir)
    else
      removeFile(job_id, work_dir)
    end
    references
  end
end
|
@@ -7,6 +7,7 @@ require 'expect'
|
|
7
7
|
require 'pdf-reader'
|
8
8
|
require 'nokogiri'
|
9
9
|
require 'json'
|
10
|
+
|
10
11
|
module ArxivUtil
|
11
12
|
BASE_URL = "https://arxiv.org"
|
12
13
|
REFERENCE_START_REGEXP = Regexp.new('References|REFERENCES|Reference|REFERENCE')
|
@@ -45,7 +46,7 @@ module ArxivUtil
|
|
45
46
|
end
|
46
47
|
|
47
48
|
|
48
|
-
def self.fetchFromUrl(urlName, work_dir, use_dir)
|
49
|
+
def self.fetchFromUrl(urlName, work_dir, use_dir, use_pdf)
|
49
50
|
puts "fetch => #{urlName}"
|
50
51
|
charset = nil
|
51
52
|
html = open(urlName) do |f|
|
@@ -55,17 +56,17 @@ module ArxivUtil
|
|
55
56
|
|
56
57
|
page = Nokogiri::HTML.parse(html, nil, charset)
|
57
58
|
result = {}
|
58
|
-
result[:title] = page.xpath('//*[@id="abs"]/div[2]/h1').text
|
59
|
-
result[:authors] = page.xpath('//*[@id="abs"]/div[2]/div[2]/a').text
|
60
|
-
result[:abstruct] = page.xpath('//*[@id="abs"]/div[2]/blockquote').text
|
59
|
+
result[:title] = page.xpath('//*[@id="abs"]/div[2]/h1').children.select{|i| i.name=='text'}.shift.text.gsub(/\n/,'')
|
60
|
+
result[:authors] = page.xpath('//*[@id="abs"]/div[2]/div[2]/a').map(&:text)
|
61
|
+
result[:abstruct] = page.xpath('//*[@id="abs"]/div[2]/blockquote').children.select{|i| i.name = 'text'}.reverse.shift.text
|
61
62
|
result[:pdfurl] = "#{BASE_URL}#{page.xpath('//*[@id="abs"]/div[1]/div[1]/ul/li[1]/a').attr('href').value}"
|
62
|
-
result[:references] = fetchFromPdfUrl(result[:pdfurl], work_dir, use_dir)
|
63
|
-
return result
|
63
|
+
result[:references] = fetchFromPdfUrl(result[:pdfurl], work_dir, use_dir) if use_pdf
|
64
|
+
return result
|
64
65
|
end
|
65
66
|
|
66
|
-
def self.fetchFromArxivId(id, work_dir, use_dir)
|
67
|
+
# Builds the abstract-page URL for the arXiv identifier +id+ under BASE_URL
# and passes it, with the remaining arguments unchanged, to fetchFromUrl.
def self.fetchFromArxivId(id, work_dir, use_dir, use_pdf)
  fetchFromUrl("#{BASE_URL}/abs/#{id}", work_dir, use_dir, use_pdf)
end
|
70
71
|
|
71
72
|
def self.fetchPdfFile(pdfUrl,file_name)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: arxiv-references
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Takahiro Nishimura
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-05-
|
11
|
+
date: 2016-05-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -115,7 +115,9 @@ files:
|
|
115
115
|
- bin/setup
|
116
116
|
- exe/arxiv-ref
|
117
117
|
- lib/arxiv/references.rb
|
118
|
+
- lib/arxiv/references/Arxiv.rb
|
118
119
|
- lib/arxiv/references/ArxivReferences.rb
|
120
|
+
- lib/arxiv/references/FetchPaperPDF.rb
|
119
121
|
- lib/arxiv/references/myUtil.rb
|
120
122
|
- lib/arxiv/references/version.rb
|
121
123
|
homepage: https://github.com/nishimuuu/Arxiv-references
|