arxiv-references 0.1.6.5 → 0.1.7.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2f01c6085fecee426a78a163f1e27d662ac915ab
4
- data.tar.gz: e5f51fe5c4eae1a325fdc98527f571710d746fbd
3
+ metadata.gz: c1fd3a9552e15b293ec4f57b89a2393e840f69d9
4
+ data.tar.gz: 2cee2a9b1dc53d287c44c79cfd6f18e50d109118
5
5
  SHA512:
6
- metadata.gz: 916648916d0a88954972d648496ebdbd07b8a20708fb1e266d2e58cb92b27cb87e637d0fbc28198b74581ecbb0f6cd07e71d88110f78fa207dfcb20f06307f0f
7
- data.tar.gz: da7e9de22b948832613c042590b4da80c7f3c865125cdea68d614f7c0651c8bae48cfa43c71d2b705e5599e06d42f96004e6dd02be16f2e7af36f6a283321f18
6
+ metadata.gz: d6e24ff8fb896d0658fd7526ec62f61078a377f2b3df8ad200c4bd25cab97b0bba34ca6337991b5806eb048967e7e8ca3d17030020adc46ee8873f03612f963f
7
+ data.tar.gz: 46c78134b1b051e495162d4ce659e0ea4f7134991eaf824cd2c0b96c2875c4aa0da4ce64db1dbfb87b2db81e97846b7afc3c797f571d54dfd3a7326c66c90c73
data/README.md CHANGED
@@ -1,3 +1,4 @@
1
+ [![Build Status](https://travis-ci.org/nishimuuu/Arxiv-references.svg?branch=master)](https://travis-ci.org/nishimuuu/Arxiv-references)
1
2
  [![Gem Version](https://badge.fury.io/rb/arxiv-references.svg)](https://badge.fury.io/rb/arxiv-references)
2
3
  [![LICENSES](https://img.shields.io/badge/LICENSE-GPL-blue.svg)](https://img.shields.io/badge/LICENSE-GPL-blue.svg)
3
4
  [![Code Climate](https://codeclimate.com/github/nishimuuu/Arxiv-references/badges/gpa.svg)](https://codeclimate.com/github/nishimuuu/Arxiv-references)
@@ -6,6 +7,11 @@
6
7
 
7
8
  Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/arxiv/references`. To experiment with that code, run `bin/console` for an interactive prompt.
8
9
 
10
+ ## Demo
11
+ [Demo site](http://153.126.133.121/arxiv-references-api/html/index.html)
12
+
13
+ [API Document](http://153.126.133.121/arxiv-references-api/html/api.html)
14
+
9
15
  ## Dependencies
10
16
 
11
17
  - k2pdfopt (http://www.willus.com/k2pdfopt/)
@@ -46,8 +52,8 @@ Or install it yourself as:
46
52
 
47
53
  ### Options
48
54
  --work_dir : [default: /tmp] working directory used while converting a multi-column PDF to a single column
49
- --use_dir : [default: true] create working directory or not
50
-
55
+ --dir : [default: true] whether to create a working directory for the conversion
56
+ --pdf : [default: true] fetch and parse the PDF to extract the citation list; if you don't need the citation list, pass `--no-pdf`
51
57
 
52
58
  ## Development
53
59
 
@@ -0,0 +1,41 @@
1
+ require 'open-uri'
2
+ require 'nokogiri'
3
+ require 'json'
4
+
5
+ class Arxiv
6
+ attr_reader :title, :authors, :abstruct, :pdfurl
7
+ attr_accessor :references
8
+ BASE_URL = 'https://arxiv.org'
9
+ def initialize(id)
10
+ url = "#{BASE_URL}/abs/#{id}" if id.index('http').nil?
11
+ charset = nil
12
+ html = open(url) do |f|
13
+ charset = f.charset
14
+ f.read
15
+ end
16
+ @page = Nokogiri::HTML.parse(html, nil, charset)
17
+ @title = fetch_title
18
+ @authors = fetch_authors
19
+ @abstruct = fetch_abstruct
20
+ @pdfurl = fetch_pdfurl
21
+ @references = nil
22
+ end
23
+
24
+ def fetch_title
25
+ @page.xpath('//*[@id="abs"]/div[2]/h1').children.select{|i| i.name=='text'}.shift.text.gsub(/\n/,'')
26
+ end
27
+
28
+ def fetch_authors
29
+ @page.xpath('//*[@id="abs"]/div[2]/div[2]/a').map(&:text)
30
+ end
31
+
32
+ def fetch_abstruct
33
+ @page.xpath('//*[@id="abs"]/div[2]/blockquote').children.select{|i| i.name = 'text'}.reverse.shift.text
34
+ end
35
+
36
+ def fetch_pdfurl
37
+ "#{BASE_URL}#{@page.xpath('//*[@id="abs"]/div[1]/div[1]/ul/li[1]/a').attr('href').value}"
38
+ end
39
+ end
40
+
41
+
@@ -5,36 +5,30 @@ require 'pathname'
5
5
  lib = Pathname.new(__FILE__).dirname.join().expand_path
6
6
  $:.unshift lib.to_s
7
7
  require 'myUtil'
8
+ require 'json'
8
9
 
9
10
 
10
11
  module ArxivReferences
11
12
  class CLI < Thor
12
13
  include ArxivUtil
14
+ class_option 'work_dir', type: :string, aliases: 'Working dir', desc: 'Set working dir(default: /tmp)', default: '/tmp'
15
+ class_option 'dir', type: :boolean, aliases: 'Working in dir', desc: 'work to make dir or not(default: true)', default: true
16
+ class_option 'pdf', type: :boolean, aliases: 'Parse PDF', desc: 'fetch pdf information(defaut: true)', default: true
17
+
18
+
13
19
  desc 'url', 'Extract references from arxiv URL'
14
- option 'work_dir', type: :string, aliases: '-work', desc: 'Set working dir(default: /tmp)'
15
- option 'use_dir', type: :boolean, aliases: '-use_dir', desc: 'work to make dir or not(default: true)'
16
20
  def url(urlName)
17
- work_dir = options['work_dir'].nil? ? '/tmp' : options['work_dir']
18
- use_dir = options['use_dir'].nil? ? true : options['use_dir']
19
- puts ArxivUtil.fetchFromUrl(urlName, work_dir, use_dir)
21
+ puts ArxivUtil.fetchFromUrl(urlName, options[:work_dir], options[:use_dir], options[:no_pdf]).to_json
20
22
  end
21
23
 
22
24
  desc 'id', 'Extract references from Arxiv id'
23
- option 'work_dir', type: :string, aliases: '-work', desc: 'Set working dir(default: /tmp)'
24
- option 'use_dir', type: :boolean, aliases: '-use_dir', desc: 'work to make dir or not(default: true)'
25
25
  def arxivid(idName)
26
- work_dir = options['work_dir'].nil? ? '/tmp' : options['work_dir']
27
- use_dir = options['use_dir'].nil? ? true : options['use_dir']
28
- puts ArxivUtil.fetchFromArxivId(idName, work_dir, use_dir)
26
+ puts ArxivUtil.fetchFromArxivId(idName, options[:work_dir], options[:use_dir], options[:no_pdf]).to_json
29
27
  end
30
28
 
31
29
  desc 'pdfurl', 'Extract references from pdf URL'
32
- option 'work_dir', type: :string, aliases: '-work', desc: 'Set working dir(default: /tmp)'
33
- option 'use_dir', type: :boolean, aliases: '-use_dir', desc: 'work to make dir or not(default: true)'
34
30
  def pdfurl(pdfUrlName)
35
- work_dir = options['work_dir'].nil? ? '/tmp' : options['work_dir']
36
- use_dir = options['use_dir'].nil? ? true : options['use_dir']
37
- puts ArxivUtil.fetchFromPdfUrl(pdfUrlName, work_dir, use_dir)
31
+ puts ArxivUtil.fetchFromPdfUrl(pdfUrlName, options[:work_dir], options[:use_dir]).to_json
38
32
  end
39
33
  end
40
34
  end
@@ -0,0 +1,80 @@
1
+ require 'digest/sha2'
2
+ require 'time'
3
+ require 'fileutils'
4
+ require 'pty'
5
+ require 'expect'
6
+ require 'pdf-reader'
7
+
8
+ class P3
9
+ def self.fetchPdfFile(pdfUrl,file_name)
10
+ open(file_name, 'wb') do |o|
11
+ open(pdfUrl) do |data|
12
+ o.write(data.read)
13
+ end
14
+ end
15
+ end
16
+
17
+ def self.convertSingleColPdf(job_id, work_dir,file_name, use_dir)
18
+ cmd = "k2pdfopt -dev kpw #{file_name}"
19
+ PTY.spawn(cmd) do |i,o|
20
+ o.sync = true
21
+ i.expect(/\S.*Enter option above \(h=help, q=quit\):/,10){
22
+ o.puts "\n"
23
+ o.flush
24
+ }
25
+ while( i.eof? == false )
26
+ res = i.gets
27
+ print res
28
+ break unless res.index('written').nil?
29
+ end
30
+ end
31
+ return getK2Pdf(job_id, work_dir, use_dir)
32
+ end
33
+
34
+ def self.fetchReference(file_name)
35
+ reader = PDF::Reader.new(file_name)
36
+ page_no = reader.
37
+ pages.
38
+ reject{|i|
39
+ i.text.index(REFERENCE_START_REGEXP).nil?
40
+ }.
41
+ map(&:number).
42
+ sort.
43
+ shift
44
+ puts "Detect References page=> #{page_no} "
45
+ ref_page = reader.
46
+ pages.
47
+ select{|i|
48
+ i.number >= page_no
49
+ }.
50
+ map{|i|
51
+ i.text.gsub(/\n+/,"\n").gsub(/ +/,' ')
52
+ }.
53
+ join(' ').
54
+ gsub(REFERENCE_REGEXP,"\n\\1").
55
+ gsub('- ','').
56
+ split("\n")
57
+
58
+ return ref_page[(ref_page.index{|i| i =~ REFERENCE_START_REGEXP}+1)..ref_page.length].
59
+ select{|i|
60
+ i.length > 5
61
+ }
62
+ end
63
+
64
+ def self.fetchFromPdfUrl(pdfUrl, work_dir, use_dir)
65
+ job_id = makeId
66
+ makeDir(job_id, work_dir) if use_dir
67
+ file_name = makeFile(job_id, work_dir, use_dir)
68
+
69
+ fetchPdfFile(pdfUrl, file_name)
70
+ executed_pdf = convertSingleColPdf(job_id, work_dir, file_name, use_dir)
71
+ references = fetchReference(executed_pdf)
72
+ if use_dir
73
+ removeDir(job_id, work_dir)
74
+ else
75
+ removeFile(job_id, work_dir)
76
+ end
77
+ return references
78
+ end
79
+
80
+ end
@@ -7,6 +7,7 @@ require 'expect'
7
7
  require 'pdf-reader'
8
8
  require 'nokogiri'
9
9
  require 'json'
10
+
10
11
  module ArxivUtil
11
12
  BASE_URL = "https://arxiv.org"
12
13
  REFERENCE_START_REGEXP = Regexp.new('References|REFERENCES|Reference|REFERENCE')
@@ -45,7 +46,7 @@ module ArxivUtil
45
46
  end
46
47
 
47
48
 
48
- def self.fetchFromUrl(urlName, work_dir, use_dir)
49
+ def self.fetchFromUrl(urlName, work_dir, use_dir, use_pdf)
49
50
  puts "fetch => #{urlName}"
50
51
  charset = nil
51
52
  html = open(urlName) do |f|
@@ -55,17 +56,17 @@ module ArxivUtil
55
56
 
56
57
  page = Nokogiri::HTML.parse(html, nil, charset)
57
58
  result = {}
58
- result[:title] = page.xpath('//*[@id="abs"]/div[2]/h1').text
59
- result[:authors] = page.xpath('//*[@id="abs"]/div[2]/div[2]/a').text
60
- result[:abstruct] = page.xpath('//*[@id="abs"]/div[2]/blockquote').text
59
+ result[:title] = page.xpath('//*[@id="abs"]/div[2]/h1').children.select{|i| i.name=='text'}.shift.text.gsub(/\n/,'')
60
+ result[:authors] = page.xpath('//*[@id="abs"]/div[2]/div[2]/a').map(&:text)
61
+ result[:abstruct] = page.xpath('//*[@id="abs"]/div[2]/blockquote').children.select{|i| i.name = 'text'}.reverse.shift.text
61
62
  result[:pdfurl] = "#{BASE_URL}#{page.xpath('//*[@id="abs"]/div[1]/div[1]/ul/li[1]/a').attr('href').value}"
62
- result[:references] = fetchFromPdfUrl(result[:pdfurl], work_dir, use_dir)
63
- return result.to_json
63
+ result[:references] = fetchFromPdfUrl(result[:pdfurl], work_dir, use_dir) if use_pdf
64
+ return result
64
65
  end
65
66
 
66
- def self.fetchFromArxivId(id, work_dir, use_dir)
67
+ def self.fetchFromArxivId(id, work_dir, use_dir, use_pdf)
67
68
  target_url = "#{BASE_URL}/abs/#{id}"
68
- fetchFromUrl(target_url, work_dir, use_dir)
69
+ fetchFromUrl(target_url, work_dir, use_dir, use_pdf)
69
70
  end
70
71
 
71
72
  def self.fetchPdfFile(pdfUrl,file_name)
@@ -1,5 +1,5 @@
1
1
  module Arxiv
2
2
  module References
3
- VERSION = "0.1.6.5"
3
+ VERSION = "0.1.7.0"
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arxiv-references
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6.5
4
+ version: 0.1.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Takahiro Nishimura
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-05-03 00:00:00.000000000 Z
11
+ date: 2016-05-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -115,7 +115,9 @@ files:
115
115
  - bin/setup
116
116
  - exe/arxiv-ref
117
117
  - lib/arxiv/references.rb
118
+ - lib/arxiv/references/Arxiv.rb
118
119
  - lib/arxiv/references/ArxivReferences.rb
120
+ - lib/arxiv/references/FetchPaperPDF.rb
119
121
  - lib/arxiv/references/myUtil.rb
120
122
  - lib/arxiv/references/version.rb
121
123
  homepage: https://github.com/nishimuuu/Arxiv-references