arxiv-references 0.1.6.5 → 0.1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2f01c6085fecee426a78a163f1e27d662ac915ab
4
- data.tar.gz: e5f51fe5c4eae1a325fdc98527f571710d746fbd
3
+ metadata.gz: c1fd3a9552e15b293ec4f57b89a2393e840f69d9
4
+ data.tar.gz: 2cee2a9b1dc53d287c44c79cfd6f18e50d109118
5
5
  SHA512:
6
- metadata.gz: 916648916d0a88954972d648496ebdbd07b8a20708fb1e266d2e58cb92b27cb87e637d0fbc28198b74581ecbb0f6cd07e71d88110f78fa207dfcb20f06307f0f
7
- data.tar.gz: da7e9de22b948832613c042590b4da80c7f3c865125cdea68d614f7c0651c8bae48cfa43c71d2b705e5599e06d42f96004e6dd02be16f2e7af36f6a283321f18
6
+ metadata.gz: d6e24ff8fb896d0658fd7526ec62f61078a377f2b3df8ad200c4bd25cab97b0bba34ca6337991b5806eb048967e7e8ca3d17030020adc46ee8873f03612f963f
7
+ data.tar.gz: 46c78134b1b051e495162d4ce659e0ea4f7134991eaf824cd2c0b96c2875c4aa0da4ce64db1dbfb87b2db81e97846b7afc3c797f571d54dfd3a7326c66c90c73
data/README.md CHANGED
@@ -1,3 +1,4 @@
1
+ [![Build Status](https://travis-ci.org/nishimuuu/Arxiv-references.svg?branch=master)](https://travis-ci.org/nishimuuu/Arxiv-references)
1
2
  [![Gem Version](https://badge.fury.io/rb/arxiv-references.svg)](https://badge.fury.io/rb/arxiv-references)
2
3
  [![LICENSES](https://img.shields.io/badge/LICENSE-GPL-blue.svg)](https://img.shields.io/badge/LICENSE-GPL-blue.svg)
3
4
  [![Code Climate](https://codeclimate.com/github/nishimuuu/Arxiv-references/badges/gpa.svg)](https://codeclimate.com/github/nishimuuu/Arxiv-references)
@@ -6,6 +7,11 @@
6
7
 
7
8
  Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/arxiv/references`. To experiment with that code, run `bin/console` for an interactive prompt.
8
9
 
10
+ ## Demo
11
+ [URL](http://153.126.133.121/arxiv-references-api/html/index.html)
12
+
13
+ [API Document](http://153.126.133.121/arxiv-references-api/html/api.html)
14
+
9
15
  ## Dependencies
10
16
 
11
17
  - k2pdfopt (http://www.willus.com/k2pdfopt/)
@@ -46,8 +52,8 @@ Or install it yourself as:
46
52
 
47
53
  ### Options
48
54
  --work_dir : [default: /tmp] working directory to convert multi column pdf to one column
49
- --use_dir : [default: true] create working directory or not
50
-
55
+ --dir : [default: true] create working directory or not
56
+ --pdf : [default: true] fetch the citation list from the paper's PDF; pass `--no-pdf` if you don't need it
51
57
 
52
58
  ## Development
53
59
 
@@ -0,0 +1,41 @@
1
+ require 'open-uri'
2
+ require 'nokogiri'
3
+ require 'json'
4
+
5
# Scrapes the abstract page of a single arXiv paper.
#
# The constructor downloads the page and eagerly extracts the title, the
# author list, the abstract text and the PDF link. +references+ starts out
# as nil and is writable so a caller can attach a reference list later.
#
# Relies on open-uri and Nokogiri being loaded by the enclosing file.
class Arxiv
  attr_reader :title, :authors, :abstruct, :pdfurl
  attr_accessor :references

  # Correctly spelled reader; +abstruct+ is kept for existing callers.
  def abstract
    @abstruct
  end

  BASE_URL = 'https://arxiv.org'.freeze

  # @param id [String] an arXiv identifier (e.g. "1605.00001") or the full
  #   http(s) URL of an abstract page
  def initialize(id)
    url = abs_url(id)
    charset = nil
    # URI#open (from open-uri) instead of Kernel#open: Kernel#open would
    # treat input starting with "|" as a command to run, and it no longer
    # opens URLs on Ruby 3.
    html = URI.parse(url).open do |f|
      charset = f.charset
      f.read
    end
    @page = Nokogiri::HTML.parse(html, nil, charset)
    @title = fetch_title
    @authors = fetch_authors
    @abstruct = fetch_abstruct
    @pdfurl = fetch_pdfurl
    @references = nil
  end

  # @return [String, nil] text of the first bare text node inside the title
  #   heading with newlines removed, or nil when there is no such node
  def fetch_title
    node = @page.xpath('//*[@id="abs"]/div[2]/h1').children.find { |i| i.name == 'text' }
    node && node.text.delete("\n")
  end

  # @return [Array<String>] text of every author link
  def fetch_authors
    @page.xpath('//*[@id="abs"]/div[2]/div[2]/a').map(&:text)
  end

  # @return [String, nil] text of the last bare text node inside the abstract
  #   blockquote, or nil when there is no such node
  def fetch_abstruct
    # Compare with ==; the previous `i.name = 'text'` was an assignment that
    # selected every child and renamed the nodes as a side effect.
    node = @page.xpath('//*[@id="abs"]/div[2]/blockquote').children.select { |i| i.name == 'text' }.last
    node && node.text
  end

  # @return [String] BASE_URL joined with the href of the first download link
  def fetch_pdfurl
    "#{BASE_URL}#{@page.xpath('//*[@id="abs"]/div[1]/div[1]/ul/li[1]/a').attr('href').value}"
  end

  private

  # Turns a bare arXiv id into its abstract-page URL; full http(s) URLs are
  # returned unchanged.
  def abs_url(id)
    id = id.to_s
    return id if id.start_with?('http://', 'https://')

    "#{BASE_URL}/abs/#{id}"
  end
end
40
+
41
+
@@ -5,36 +5,30 @@ require 'pathname'
5
5
  lib = Pathname.new(__FILE__).dirname.join().expand_path
6
6
  $:.unshift lib.to_s
7
7
  require 'myUtil'
8
+ require 'json'
8
9
 
9
10
 
10
11
module ArxivReferences
  # Thor command-line front end. Each command forwards its argument and the
  # shared options to ArxivUtil and prints the result as JSON.
  class CLI < Thor
    include ArxivUtil

    # Options shared by every command. Because `dir` and `pdf` are boolean,
    # Thor also accepts the negated forms `--no-dir` and `--no-pdf`.
    class_option 'work_dir', type: :string, aliases: '-w', desc: 'Set working dir(default: /tmp)', default: '/tmp'
    class_option 'dir', type: :boolean, aliases: '-d', desc: 'work to make dir or not(default: true)', default: true
    class_option 'pdf', type: :boolean, aliases: '-p', desc: 'fetch pdf information(default: true)', default: true

    # The help text used to advertise this command as `id`; keep that
    # spelling working alongside the real command name.
    map 'id' => :arxivid

    desc 'url URL', 'Extract references from arxiv URL'
    def url(urlName)
      # Keys must match the declared option names (`dir`, `pdf`); the former
      # :use_dir / :no_pdf lookups always returned nil.
      puts ArxivUtil.fetchFromUrl(urlName, options[:work_dir], options[:dir], options[:pdf]).to_json
    end

    desc 'arxivid ID', 'Extract references from Arxiv id'
    def arxivid(idName)
      puts ArxivUtil.fetchFromArxivId(idName, options[:work_dir], options[:dir], options[:pdf]).to_json
    end

    desc 'pdfurl PDF_URL', 'Extract references from pdf URL'
    def pdfurl(pdfUrlName)
      puts ArxivUtil.fetchFromPdfUrl(pdfUrlName, options[:work_dir], options[:dir]).to_json
    end
  end
end
@@ -0,0 +1,80 @@
1
+ require 'digest/sha2'
2
+ require 'time'
3
+ require 'fileutils'
4
+ require 'pty'
5
+ require 'expect'
6
+ require 'pdf-reader'
7
+
8
# Downloads a paper's PDF, converts it to a single-column layout with the
# external `k2pdfopt` tool, and extracts the lines of its reference section.
#
# NOTE(review): this class calls makeId, makeDir, makeFile, getK2Pdf,
# removeDir and removeFile and reads REFERENCE_START_REGEXP and
# REFERENCE_REGEXP, none of which are defined in this class - presumably
# they are expected to come from ArxivUtil; confirm they are reachable here.
class P3
  # Streams the document at +pdfUrl+ into the local file +file_name+.
  #
  # Uses URI#open (open-uri, which must be loaded, as it already had to be
  # for the previous Kernel#open call) so that caller-supplied input
  # beginning with "|" can never be run as a command.
  #
  # @return [Integer] number of bytes written
  def self.fetchPdfFile(pdfUrl, file_name)
    File.open(file_name, 'wb') do |out|
      URI.parse(pdfUrl).open do |data|
        IO.copy_stream(data, out)
      end
    end
  end

  # Runs `k2pdfopt -dev kpw` on +file_name+ inside a pseudo terminal, answers
  # its interactive prompt with an empty line, and echoes its output until a
  # line containing "written" is seen.
  #
  # @return whatever getK2Pdf(job_id, work_dir, use_dir) returns
  def self.convertSingleColPdf(job_id, work_dir, file_name, use_dir)
    # Argument-vector form: file_name is passed as one argument and never
    # goes through a shell, so spaces or metacharacters in it are harmless.
    PTY.spawn('k2pdfopt', '-dev', 'kpw', file_name) do |i, o|
      o.sync = true
      i.expect(/\S.*Enter option above \(h=help, q=quit\):/, 10) do
        o.puts "\n"
        o.flush
      end
      begin
        # gets returns nil at end of output, which ends the loop.
        while (res = i.gets)
          print res
          break if res.include?('written')
        end
      rescue Errno::EIO
        # Reading a pty after the child has exited raises EIO on some
        # platforms; treat it as end of output.
      end
    end
    getK2Pdf(job_id, work_dir, use_dir)
  end

  # Extracts the reference entries from the PDF at +file_name+.
  #
  # Finds the lowest-numbered page whose text matches
  # REFERENCE_START_REGEXP, joins the text of that page and all later ones,
  # breaks it into lines at REFERENCE_REGEXP matches, and returns the lines
  # after the heading that are longer than five characters.
  #
  # @return [Array<String>] reference lines; empty when no heading is found
  def self.fetchReference(file_name)
    reader = PDF::Reader.new(file_name)
    # Extract each page's text once; it is needed for both passes below.
    numbered_text = reader.pages.map { |pg| [pg.number, pg.text] }

    page_no = numbered_text.
      select { |_, text| text =~ REFERENCE_START_REGEXP }.
      map(&:first).
      min
    puts "Detect References page=> #{page_no} "
    return [] if page_no.nil?

    ref_page = numbered_text.
      select { |number, _| number >= page_no }.
      map { |_, text| text.gsub(/\n+/, "\n").gsub(/ +/, ' ') }.
      join(' ').
      gsub(REFERENCE_REGEXP, "\n\\1").
      gsub('- ', '').
      split("\n")

    heading = ref_page.index { |line| line =~ REFERENCE_START_REGEXP }
    return [] if heading.nil?

    ref_page[(heading + 1)..-1].select { |line| line.length > 5 }
  end

  # Full pipeline: download the PDF, convert it, extract the references.
  #
  # @param pdfUrl [String] URL of the PDF
  # @param work_dir [String] directory handed to the make*/remove* helpers
  # @param use_dir [Boolean] when truthy, makeDir is called before the work
  #   and removeDir afterwards; otherwise only removeFile is called
  # @return [Array<String>] the extracted reference lines
  def self.fetchFromPdfUrl(pdfUrl, work_dir, use_dir)
    job_id = makeId
    makeDir(job_id, work_dir) if use_dir
    file_name = makeFile(job_id, work_dir, use_dir)

    begin
      fetchPdfFile(pdfUrl, file_name)
      executed_pdf = convertSingleColPdf(job_id, work_dir, file_name, use_dir)
      fetchReference(executed_pdf)
    ensure
      # Clean up even when download, conversion or parsing raised, so that
      # failed jobs do not leave files behind.
      if use_dir
        removeDir(job_id, work_dir)
      else
        removeFile(job_id, work_dir)
      end
    end
  end
end
@@ -7,6 +7,7 @@ require 'expect'
7
7
  require 'pdf-reader'
8
8
  require 'nokogiri'
9
9
  require 'json'
10
+
10
11
  module ArxivUtil
11
12
  BASE_URL = "https://arxiv.org"
12
13
  REFERENCE_START_REGEXP = Regexp.new('References|REFERENCES|Reference|REFERENCE')
@@ -45,7 +46,7 @@ module ArxivUtil
45
46
  end
46
47
 
47
48
 
48
- def self.fetchFromUrl(urlName, work_dir, use_dir)
49
+ def self.fetchFromUrl(urlName, work_dir, use_dir, use_pdf)
49
50
  puts "fetch => #{urlName}"
50
51
  charset = nil
51
52
  html = open(urlName) do |f|
@@ -55,17 +56,17 @@ module ArxivUtil
55
56
 
56
57
  page = Nokogiri::HTML.parse(html, nil, charset)
57
58
  result = {}
58
- result[:title] = page.xpath('//*[@id="abs"]/div[2]/h1').text
59
- result[:authors] = page.xpath('//*[@id="abs"]/div[2]/div[2]/a').text
60
- result[:abstruct] = page.xpath('//*[@id="abs"]/div[2]/blockquote').text
59
+ result[:title] = page.xpath('//*[@id="abs"]/div[2]/h1').children.select{|i| i.name=='text'}.shift.text.gsub(/\n/,'')
60
+ result[:authors] = page.xpath('//*[@id="abs"]/div[2]/div[2]/a').map(&:text)
61
+ result[:abstruct] = page.xpath('//*[@id="abs"]/div[2]/blockquote').children.select{|i| i.name = 'text'}.reverse.shift.text
61
62
  result[:pdfurl] = "#{BASE_URL}#{page.xpath('//*[@id="abs"]/div[1]/div[1]/ul/li[1]/a').attr('href').value}"
62
- result[:references] = fetchFromPdfUrl(result[:pdfurl], work_dir, use_dir)
63
- return result.to_json
63
+ result[:references] = fetchFromPdfUrl(result[:pdfurl], work_dir, use_dir) if use_pdf
64
+ return result
64
65
  end
65
66
 
66
# Builds the abstract-page URL for an arXiv id and delegates to fetchFromUrl.
#
# use_pdf defaults to true so that callers written against the earlier
# three-argument form (id, work_dir, use_dir) keep working and keep
# receiving references.
def self.fetchFromArxivId(id, work_dir, use_dir, use_pdf = true)
  target_url = "#{BASE_URL}/abs/#{id}"
  fetchFromUrl(target_url, work_dir, use_dir, use_pdf)
end
70
71
 
71
72
  def self.fetchPdfFile(pdfUrl,file_name)
@@ -1,5 +1,5 @@
1
1
  module Arxiv
2
2
  module References
3
- VERSION = "0.1.6.5"
3
+ VERSION = "0.1.7.0"
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arxiv-references
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6.5
4
+ version: 0.1.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Takahiro Nishimura
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-05-03 00:00:00.000000000 Z
11
+ date: 2016-05-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -115,7 +115,9 @@ files:
115
115
  - bin/setup
116
116
  - exe/arxiv-ref
117
117
  - lib/arxiv/references.rb
118
+ - lib/arxiv/references/Arxiv.rb
118
119
  - lib/arxiv/references/ArxivReferences.rb
120
+ - lib/arxiv/references/FetchPaperPDF.rb
119
121
  - lib/arxiv/references/myUtil.rb
120
122
  - lib/arxiv/references/version.rb
121
123
  homepage: https://github.com/nishimuuu/Arxiv-references