arxiv-references 0.1.6.5 → 0.1.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +8 -2
- data/lib/arxiv/references/Arxiv.rb +41 -0
- data/lib/arxiv/references/ArxivReferences.rb +9 -15
- data/lib/arxiv/references/FetchPaperPDF.rb +80 -0
- data/lib/arxiv/references/myUtil.rb +9 -8
- data/lib/arxiv/references/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c1fd3a9552e15b293ec4f57b89a2393e840f69d9
|
4
|
+
data.tar.gz: 2cee2a9b1dc53d287c44c79cfd6f18e50d109118
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d6e24ff8fb896d0658fd7526ec62f61078a377f2b3df8ad200c4bd25cab97b0bba34ca6337991b5806eb048967e7e8ca3d17030020adc46ee8873f03612f963f
|
7
|
+
data.tar.gz: 46c78134b1b051e495162d4ce659e0ea4f7134991eaf824cd2c0b96c2875c4aa0da4ce64db1dbfb87b2db81e97846b7afc3c797f571d54dfd3a7326c66c90c73
|
data/README.md
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
[![Build Status](https://travis-ci.org/nishimuuu/Arxiv-references.svg?branch=master)](https://travis-ci.org/nishimuuu/Arxiv-references)
|
1
2
|
[![Gem Version](https://badge.fury.io/rb/arxiv-references.svg)](https://badge.fury.io/rb/arxiv-references)
|
2
3
|
[![LICENSES](https://img.shields.io/badge/LICENSE-GPL-blue.svg)](https://img.shields.io/badge/LICENSE-GPL-blue.svg)
|
3
4
|
[![Code Climate](https://codeclimate.com/github/nishimuuu/Arxiv-references/badges/gpa.svg)](https://codeclimate.com/github/nishimuuu/Arxiv-references)
|
@@ -6,6 +7,11 @@
|
|
6
7
|
|
7
8
|
Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/arxiv/references`. To experiment with that code, run `bin/console` for an interactive prompt.
|
8
9
|
|
10
|
+
## Demo
|
11
|
+
[URL](http://153.126.133.121/arxiv-references-api/html/index.html)
|
12
|
+
|
13
|
+
[API Document](http://153.126.133.121/arxiv-references-api/html/api.html)
|
14
|
+
|
9
15
|
## Dependencies
|
10
16
|
|
11
17
|
- k2pdfopt (http://www.willus.com/k2pdfopt/)
|
@@ -46,8 +52,8 @@ Or install it yourself as:
|
|
46
52
|
|
47
53
|
### Options
|
48
54
|
--work_dir : [default: /tmp] working directory to convert multi column pdf to one column
|
49
|
-
--
|
50
|
-
|
55
|
+
--dir : [default: true] create working directory or not
|
56
|
+
--pdf : [default: true] fetch the citation list from the paper's PDF; pass `--no-pdf` if you don't need the citation list
|
51
57
|
|
52
58
|
## Development
|
53
59
|
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
# Scrapes the metadata of a single paper from its arXiv abstract page.
#
# The XPath expressions below are tied to the markup of the arXiv "abs" page
# as it looked when this was written; NOTE(review): confirm they still match
# the live site.
class Arxiv
  attr_reader :title, :authors, :abstruct, :pdfurl
  attr_accessor :references
  BASE_URL = 'https://arxiv.org'

  # Downloads and parses the abstract page, then fills the readers.
  #
  # @param id [String] either a bare arXiv id (turned into
  #   "#{BASE_URL}/abs/<id>") or a string that already contains a URL,
  #   which is fetched as given.
  def initialize(id)
    url = resolve_url(id)
    charset = nil
    # URI#open (from open-uri) only ever performs a network fetch, unlike
    # Kernel#open, which would also treat "|cmd" strings as commands.
    html = URI.parse(url).open do |f|
      charset = f.charset
      f.read
    end
    @page = Nokogiri::HTML.parse(html, nil, charset)
    @title = fetch_title
    @authors = fetch_authors
    @abstruct = fetch_abstruct
    @pdfurl = fetch_pdfurl
    @references = nil
  end

  # @return [String] text of the first text node under the title heading,
  #   with newlines removed.
  def fetch_title
    heading = @page.xpath('//*[@id="abs"]/div[2]/h1')
    heading.children.find { |node| node.name == 'text' }.text.delete("\n")
  end

  # @return [Array<String>] link text of every author anchor.
  def fetch_authors
    @page.xpath('//*[@id="abs"]/div[2]/div[2]/a').map(&:text)
  end

  # @return [String] text of the last text node inside the abstract
  #   blockquote.
  def fetch_abstruct
    quote = @page.xpath('//*[@id="abs"]/div[2]/blockquote')
    # Compare, do not assign: `name = 'text'` renamed every child node and
    # made the filter a no-op.
    quote.children.select { |node| node.name == 'text' }.last.text
  end

  # @return [String] BASE_URL joined with the href of the first link in the
  #   page's download list.
  def fetch_pdfurl
    href = @page.xpath('//*[@id="abs"]/div[1]/div[1]/ul/li[1]/a').attr('href').value
    "#{BASE_URL}#{href}"
  end

  private

  # Returns the page URL for +id+: the value itself when it already contains
  # "http", otherwise the abstract URL built from BASE_URL.
  def resolve_url(id)
    id.include?('http') ? id : "#{BASE_URL}/abs/#{id}"
  end
end
|
40
|
+
|
41
|
+
|
@@ -5,36 +5,30 @@ require 'pathname'
|
|
5
5
|
lib = Pathname.new(__FILE__).dirname.join().expand_path
|
6
6
|
$:.unshift lib.to_s
|
7
7
|
require 'myUtil'
|
8
|
+
require 'json'
|
8
9
|
|
9
10
|
|
10
11
|
module ArxivReferences
|
11
12
|
class CLI < Thor
|
12
13
|
include ArxivUtil
|
14
|
+
class_option 'work_dir', type: :string, aliases: 'Working dir', desc: 'Set working dir(default: /tmp)', default: '/tmp'
|
15
|
+
class_option 'dir', type: :boolean, aliases: 'Working in dir', desc: 'work to make dir or not(default: true)', default: true
|
16
|
+
class_option 'pdf', type: :boolean, aliases: 'Parse PDF', desc: 'fetch pdf information(defaut: true)', default: true
|
17
|
+
|
18
|
+
|
13
19
|
desc 'url', 'Extract references from arxiv URL'
|
14
|
-
option 'work_dir', type: :string, aliases: '-work', desc: 'Set working dir(default: /tmp)'
|
15
|
-
option 'use_dir', type: :boolean, aliases: '-use_dir', desc: 'work to make dir or not(default: true)'
|
16
20
|
def url(urlName)
|
17
|
-
|
18
|
-
use_dir = options['use_dir'].nil? ? true : options['use_dir']
|
19
|
-
puts ArxivUtil.fetchFromUrl(urlName, work_dir, use_dir)
|
21
|
+
puts ArxivUtil.fetchFromUrl(urlName, options[:work_dir], options[:use_dir], options[:no_pdf]).to_json
|
20
22
|
end
|
21
23
|
|
22
24
|
desc 'id', 'Extract references from Arxiv id'
|
23
|
-
option 'work_dir', type: :string, aliases: '-work', desc: 'Set working dir(default: /tmp)'
|
24
|
-
option 'use_dir', type: :boolean, aliases: '-use_dir', desc: 'work to make dir or not(default: true)'
|
25
25
|
def arxivid(idName)
|
26
|
-
|
27
|
-
use_dir = options['use_dir'].nil? ? true : options['use_dir']
|
28
|
-
puts ArxivUtil.fetchFromArxivId(idName, work_dir, use_dir)
|
26
|
+
puts ArxivUtil.fetchFromArxivId(idName, options[:work_dir], options[:use_dir], options[:no_pdf]).to_json
|
29
27
|
end
|
30
28
|
|
31
29
|
desc 'pdfurl', 'Extract references from pdf URL'
|
32
|
-
option 'work_dir', type: :string, aliases: '-work', desc: 'Set working dir(default: /tmp)'
|
33
|
-
option 'use_dir', type: :boolean, aliases: '-use_dir', desc: 'work to make dir or not(default: true)'
|
34
30
|
def pdfurl(pdfUrlName)
|
35
|
-
|
36
|
-
use_dir = options['use_dir'].nil? ? true : options['use_dir']
|
37
|
-
puts ArxivUtil.fetchFromPdfUrl(pdfUrlName, work_dir, use_dir)
|
31
|
+
puts ArxivUtil.fetchFromPdfUrl(pdfUrlName, options[:work_dir], options[:use_dir]).to_json
|
38
32
|
end
|
39
33
|
end
|
40
34
|
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
require 'digest/sha2'
|
2
|
+
require 'time'
|
3
|
+
require 'fileutils'
|
4
|
+
require 'pty'
|
5
|
+
require 'expect'
|
6
|
+
require 'pdf-reader'
|
7
|
+
|
8
|
+
# Downloads a paper's PDF, reflows it with k2pdfopt and extracts the lines
# that follow its references heading.
#
# NOTE(review): makeId, makeDir, makeFile, getK2Pdf, removeDir, removeFile,
# REFERENCE_START_REGEXP and REFERENCE_REGEXP are used below but are not
# defined in this class as shown; presumably they are supplied elsewhere
# (ArxivUtil defines same-named items) -- confirm before relying on this class.
class P3
  # Saves the document at +pdfUrl+ to the local path +file_name+.
  #
  # @param pdfUrl [String] URL of the PDF
  # @param file_name [String] destination path, opened in binary mode
  def self.fetchPdfFile(pdfUrl, file_name)
    # This file's require list does not load open-uri, so load it here.
    require 'open-uri'
    # File.open / URI#open instead of Kernel#open, which would run a
    # "|command" string as a subprocess.
    File.open(file_name, 'wb') do |out|
      URI.parse(pdfUrl).open do |data|
        out.write(data.read)
      end
    end
  end

  # Runs `k2pdfopt -dev kpw` on +file_name+ and returns whatever getK2Pdf
  # reports for the job.
  #
  # @param job_id passed through to getK2Pdf
  # @param work_dir passed through to getK2Pdf
  # @param file_name [String] PDF handed to k2pdfopt
  # @param use_dir passed through to getK2Pdf
  def self.convertSingleColPdf(job_id, work_dir, file_name, use_dir)
    # Argument-vector form: no shell is involved, so spaces or shell
    # metacharacters in file_name cannot alter the command.
    PTY.spawn('k2pdfopt', '-dev', 'kpw', file_name) do |i, o|
      o.sync = true
      # Wait up to 10 seconds for the interactive menu prompt, then send a
      # newline (the block also runs when the wait times out).
      i.expect(/\S.*Enter option above \(h=help, q=quit\):/, 10) do
        o.puts "\n"
        o.flush
      end
      # Echo the tool's output until a line containing "written" shows up.
      while (line = i.gets)
        print line
        break if line.include?('written')
      end
    end
    getK2Pdf(job_id, work_dir, use_dir)
  end

  # Extracts candidate reference entries from the PDF at +file_name+.
  #
  # @param file_name [String] path of the PDF to read
  # @return [Array<String>] lines longer than 5 characters that follow the
  #   first line matching REFERENCE_START_REGEXP; empty when no page or line
  #   matches the heading.
  def self.fetchReference(file_name)
    reader = PDF::Reader.new(file_name)
    page_no = reader.pages.
      select { |page| page.text =~ REFERENCE_START_REGEXP }.
      map(&:number).
      min
    # Without a references heading there is nothing to extract.
    return [] if page_no.nil?

    puts "Detect References page=> #{page_no} "
    lines = reader.pages.
      select { |page| page.number >= page_no }.
      map { |page| page.text.gsub(/\n+/, "\n").gsub(/ +/, ' ') }.
      join(' ').
      gsub(REFERENCE_REGEXP, "\n\\1").
      gsub('- ', '').
      split("\n")

    heading_at = lines.index { |line| line =~ REFERENCE_START_REGEXP }
    return [] if heading_at.nil?

    lines.drop(heading_at + 1).select { |line| line.length > 5 }
  end

  # Full pipeline: download, convert, extract, clean up.
  #
  # @param pdfUrl [String] URL of the PDF
  # @param work_dir working location handed to the make*/remove* helpers
  # @param use_dir when truthy a per-job directory is created and later
  #   removed; otherwise only the job's file is removed
  # @return [Array<String>] result of fetchReference
  def self.fetchFromPdfUrl(pdfUrl, work_dir, use_dir)
    job_id = makeId
    makeDir(job_id, work_dir) if use_dir
    file_name = makeFile(job_id, work_dir, use_dir)

    begin
      fetchPdfFile(pdfUrl, file_name)
      executed_pdf = convertSingleColPdf(job_id, work_dir, file_name, use_dir)
      fetchReference(executed_pdf)
    ensure
      # Clean up even when download, conversion or parsing raises.
      if use_dir
        removeDir(job_id, work_dir)
      else
        removeFile(job_id, work_dir)
      end
    end
  end
end
|
@@ -7,6 +7,7 @@ require 'expect'
|
|
7
7
|
require 'pdf-reader'
|
8
8
|
require 'nokogiri'
|
9
9
|
require 'json'
|
10
|
+
|
10
11
|
module ArxivUtil
|
11
12
|
BASE_URL = "https://arxiv.org"
|
12
13
|
REFERENCE_START_REGEXP = Regexp.new('References|REFERENCES|Reference|REFERENCE')
|
@@ -45,7 +46,7 @@ module ArxivUtil
|
|
45
46
|
end
|
46
47
|
|
47
48
|
|
48
|
-
def self.fetchFromUrl(urlName, work_dir, use_dir)
|
49
|
+
def self.fetchFromUrl(urlName, work_dir, use_dir, use_pdf)
|
49
50
|
puts "fetch => #{urlName}"
|
50
51
|
charset = nil
|
51
52
|
html = open(urlName) do |f|
|
@@ -55,17 +56,17 @@ module ArxivUtil
|
|
55
56
|
|
56
57
|
page = Nokogiri::HTML.parse(html, nil, charset)
|
57
58
|
result = {}
|
58
|
-
result[:title] = page.xpath('//*[@id="abs"]/div[2]/h1').text
|
59
|
-
result[:authors] = page.xpath('//*[@id="abs"]/div[2]/div[2]/a').text
|
60
|
-
result[:abstruct] = page.xpath('//*[@id="abs"]/div[2]/blockquote').text
|
59
|
+
result[:title] = page.xpath('//*[@id="abs"]/div[2]/h1').children.select{|i| i.name=='text'}.shift.text.gsub(/\n/,'')
|
60
|
+
result[:authors] = page.xpath('//*[@id="abs"]/div[2]/div[2]/a').map(&:text)
|
61
|
+
result[:abstruct] = page.xpath('//*[@id="abs"]/div[2]/blockquote').children.select{|i| i.name = 'text'}.reverse.shift.text
|
61
62
|
result[:pdfurl] = "#{BASE_URL}#{page.xpath('//*[@id="abs"]/div[1]/div[1]/ul/li[1]/a').attr('href').value}"
|
62
|
-
result[:references] = fetchFromPdfUrl(result[:pdfurl], work_dir, use_dir)
|
63
|
-
return result
|
63
|
+
result[:references] = fetchFromPdfUrl(result[:pdfurl], work_dir, use_dir) if use_pdf
|
64
|
+
return result
|
64
65
|
end
|
65
66
|
|
66
|
-
def self.fetchFromArxivId(id, work_dir, use_dir)
|
67
|
+
def self.fetchFromArxivId(id, work_dir, use_dir, use_pdf)
|
67
68
|
target_url = "#{BASE_URL}/abs/#{id}"
|
68
|
-
fetchFromUrl(target_url, work_dir, use_dir)
|
69
|
+
fetchFromUrl(target_url, work_dir, use_dir, use_pdf)
|
69
70
|
end
|
70
71
|
|
71
72
|
def self.fetchPdfFile(pdfUrl,file_name)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: arxiv-references
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.6.5
|
4
|
+
version: 0.1.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Takahiro Nishimura
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-05-
|
11
|
+
date: 2016-05-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -115,7 +115,9 @@ files:
|
|
115
115
|
- bin/setup
|
116
116
|
- exe/arxiv-ref
|
117
117
|
- lib/arxiv/references.rb
|
118
|
+
- lib/arxiv/references/Arxiv.rb
|
118
119
|
- lib/arxiv/references/ArxivReferences.rb
|
120
|
+
- lib/arxiv/references/FetchPaperPDF.rb
|
119
121
|
- lib/arxiv/references/myUtil.rb
|
120
122
|
- lib/arxiv/references/version.rb
|
121
123
|
homepage: https://github.com/nishimuuu/Arxiv-references
|