arxiv-references 0.1.7.0 → 0.1.7.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c1fd3a9552e15b293ec4f57b89a2393e840f69d9
4
- data.tar.gz: 2cee2a9b1dc53d287c44c79cfd6f18e50d109118
3
+ metadata.gz: 41b13e1de94b5b60ded925f2621f87e881262b5b
4
+ data.tar.gz: 70882f9a8f74f0c8c549f23bd9c284fc0b24458e
5
5
  SHA512:
6
- metadata.gz: d6e24ff8fb896d0658fd7526ec62f61078a377f2b3df8ad200c4bd25cab97b0bba34ca6337991b5806eb048967e7e8ca3d17030020adc46ee8873f03612f963f
7
- data.tar.gz: 46c78134b1b051e495162d4ce659e0ea4f7134991eaf824cd2c0b96c2875c4aa0da4ce64db1dbfb87b2db81e97846b7afc3c797f571d54dfd3a7326c66c90c73
6
+ metadata.gz: 8439088f8eb532a5b9c0f092db5e0c918e2c49929669c6cbe8b8592b32ac88e7b63e515acac7c7cef099f7b51c09e6b7f959dd6247d39b54308805a38f572265
7
+ data.tar.gz: 9434188bcd8afa713bdc2384bb6c77af5f00ef6c9a06aeabf6f6915e48c35272f5e4f89a8a6159772f052a6487f94642eeb833913817eaec761ee29b3d9119ac
@@ -2,14 +2,14 @@ require 'open-uri'
2
2
  require 'nokogiri'
3
3
  require 'json'
4
4
 
5
- class Arxiv
5
+ class ArxivApi
6
6
  attr_reader :title, :authors, :abstruct, :pdfurl
7
7
  attr_accessor :references
8
8
  BASE_URL = 'https://arxiv.org'
9
9
  def initialize(id)
10
- url = "#{BASE_URL}/abs/#{id}" if id.index('http').nil?
10
+ id = "#{BASE_URL}/abs/#{id}" if id.index('http').nil?
11
11
  charset = nil
12
- html = open(url) do |f|
12
+ html = open(id) do |f|
13
13
  charset = f.charset
14
14
  f.read
15
15
  end
@@ -36,6 +36,11 @@ class Arxiv
36
36
  def fetch_pdfurl
37
37
  "#{BASE_URL}#{@page.xpath('//*[@id="abs"]/div[1]/div[1]/ul/li[1]/a').attr('href').value}"
38
38
  end
39
+
40
+ def to_json
41
+ JSON.pretty_generate({title: @title, authors: @authors, abstruct: @abstruct, pdfurl: @pdfurl, references: @references})
42
+ end
43
+
39
44
  end
40
45
 
41
46
 
@@ -4,7 +4,7 @@ require 'thor'
4
4
  require 'pathname'
5
5
  lib = Pathname.new(__FILE__).dirname.join().expand_path
6
6
  $:.unshift lib.to_s
7
- require 'myUtil'
7
+ require 'ArxivUtil'
8
8
  require 'json'
9
9
 
10
10
 
@@ -0,0 +1,23 @@
1
+
2
+ $:.unshift Pathname.new(__FILE__).dirname.join().expand_path.to_s
3
+ require 'ArxivApi'
4
+ require 'P3'
5
+
6
+
7
+ module ArxivUtil
8
+ BASE_URL = "https://arxiv.org"
9
+ def self.fetchFromUrl(urlName, work_dir, use_dir, use_pdf)
10
+ arxiv = ArxivApi.new(urlName)
11
+ arxiv.references = P3.fetchFromPdfUrl(arxiv.pdfurl, work_dir, use_dir) if use_pdf || use_pdf.nil?
12
+ return arxiv
13
+ end
14
+
15
+ def self.fetchFromArxivId(id, work_dir, use_dir, use_pdf)
16
+ target_url = "#{BASE_URL}/abs/#{id}"
17
+ fetchFromUrl(target_url, work_dir, use_dir, use_pdf)
18
+ end
19
+
20
+ def self.fetchFromPdfUrl(pdfUrl, work_dir, use_dir)
21
+ return P3.fetchFromPdfUrl(pdfUrl, work_dir, use_dir)
22
+ end
23
+ end
@@ -1,16 +1,13 @@
1
- require 'open-uri'
2
1
  require 'digest/sha2'
3
2
  require 'time'
4
3
  require 'fileutils'
5
4
  require 'pty'
6
5
  require 'expect'
7
6
  require 'pdf-reader'
8
- require 'nokogiri'
9
- require 'json'
10
7
 
11
- module ArxivUtil
8
+ class P3
12
9
  BASE_URL = "https://arxiv.org"
13
- REFERENCE_START_REGEXP = Regexp.new('References|REFERENCES|Reference|REFERENCE')
10
+ REFERENCE_START_REGEXP = Regexp.new('[rR][eE][fF][eE][rR][eE][nN][cC][eE][sS]*')
14
11
  REFERENCE_REGEXP = Regexp.new('(\[[0-9]?[0-9]\]|\[.+?\])')
15
12
  def self.makeId
16
13
  return Digest::SHA256.hexdigest Time.now.strftime("%F %H:%M:%S")
@@ -44,31 +41,6 @@ module ArxivUtil
44
41
  File.delete("#{work_dir}/#{id}-output.pdf")
45
42
  File.delete("#{work_dir}/#{id}-output_k2opt.pdf")
46
43
  end
47
-
48
-
49
- def self.fetchFromUrl(urlName, work_dir, use_dir, use_pdf)
50
- puts "fetch => #{urlName}"
51
- charset = nil
52
- html = open(urlName) do |f|
53
- charset = f.charset
54
- f.read
55
- end
56
-
57
- page = Nokogiri::HTML.parse(html, nil, charset)
58
- result = {}
59
- result[:title] = page.xpath('//*[@id="abs"]/div[2]/h1').children.select{|i| i.name=='text'}.shift.text.gsub(/\n/,'')
60
- result[:authors] = page.xpath('//*[@id="abs"]/div[2]/div[2]/a').map(&:text)
61
- result[:abstruct] = page.xpath('//*[@id="abs"]/div[2]/blockquote').children.select{|i| i.name = 'text'}.reverse.shift.text
62
- result[:pdfurl] = "#{BASE_URL}#{page.xpath('//*[@id="abs"]/div[1]/div[1]/ul/li[1]/a').attr('href').value}"
63
- result[:references] = fetchFromPdfUrl(result[:pdfurl], work_dir, use_dir) if use_pdf
64
- return result
65
- end
66
-
67
- def self.fetchFromArxivId(id, work_dir, use_dir, use_pdf)
68
- target_url = "#{BASE_URL}/abs/#{id}"
69
- fetchFromUrl(target_url, work_dir, use_dir, use_pdf)
70
- end
71
-
72
44
  def self.fetchPdfFile(pdfUrl,file_name)
73
45
  open(file_name, 'wb') do |o|
74
46
  open(pdfUrl) do |data|
@@ -94,7 +66,6 @@ module ArxivUtil
94
66
  return getK2Pdf(job_id, work_dir, use_dir)
95
67
  end
96
68
 
97
-
98
69
  def self.fetchReference(file_name)
99
70
  reader = PDF::Reader.new(file_name)
100
71
  page_no = reader.
@@ -105,27 +76,25 @@ module ArxivUtil
105
76
  map(&:number).
106
77
  sort.
107
78
  shift
108
- puts "Detect References page=> #{page_no} "
79
+
109
80
  ref_page = reader.
110
81
  pages.
111
82
  select{|i|
112
83
  i.number >= page_no
113
84
  }.
114
85
  map{|i|
115
- i.text.gsub(/\n+/,"\n").gsub(/ +/,' ')
86
+ i.text.gsub(/\n\n+/,"\n").gsub(/ +/,' ').gsub(/-\n +/,'')
116
87
  }.
117
88
  join(' ').
89
+ split("\n").
90
+ join(' ').
118
91
  gsub(REFERENCE_REGEXP,"\n\\1").
119
- gsub('- ','').
120
- split("\n")
121
-
122
- return ref_page[(ref_page.index{|i| i =~ REFERENCE_START_REGEXP}+1)..ref_page.length].
123
- select{|i|
124
- i.length > 5
125
- }
92
+ split("\n").
93
+ select{|i| i.length > 15}
94
+ return ref_page
126
95
  end
127
96
 
128
- def self.fetchFromPdfUrl(pdfUrl, work_dir, use_dir)
97
+ def self.fetchFromPdfUrl(pdfUrl, work_dir=true, use_dir=true)
129
98
  job_id = makeId
130
99
  makeDir(job_id, work_dir) if use_dir
131
100
  file_name = makeFile(job_id, work_dir, use_dir)
@@ -1,5 +1,5 @@
1
1
  module Arxiv
2
2
  module References
3
- VERSION = "0.1.7.0"
3
+ VERSION = "0.1.7.1"
4
4
  end
5
5
  end
@@ -4,7 +4,6 @@ lib = Pathname.new(__FILE__).dirname.join().expand_path.to_s
4
4
  $:.unshift lib
5
5
 
6
6
  require "references/version"
7
- require 'references/myUtil'
8
7
  require 'references/ArxivReferences'
9
8
  module Arxiv
10
9
  module References
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arxiv-references
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7.0
4
+ version: 0.1.7.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Takahiro Nishimura
@@ -115,10 +115,10 @@ files:
115
115
  - bin/setup
116
116
  - exe/arxiv-ref
117
117
  - lib/arxiv/references.rb
118
- - lib/arxiv/references/Arxiv.rb
118
+ - lib/arxiv/references/ArxivApi.rb
119
119
  - lib/arxiv/references/ArxivReferences.rb
120
- - lib/arxiv/references/FetchPaperPDF.rb
121
- - lib/arxiv/references/myUtil.rb
120
+ - lib/arxiv/references/ArxivUtil.rb
121
+ - lib/arxiv/references/P3.rb
122
122
  - lib/arxiv/references/version.rb
123
123
  homepage: https://github.com/nishimuuu/Arxiv-references
124
124
  licenses:
@@ -1,80 +0,0 @@
1
- require 'digest/sha2'
2
- require 'time'
3
- require 'fileutils'
4
- require 'pty'
5
- require 'expect'
6
- require 'pdf-reader'
7
-
8
- class P3
9
- def self.fetchPdfFile(pdfUrl,file_name)
10
- open(file_name, 'wb') do |o|
11
- open(pdfUrl) do |data|
12
- o.write(data.read)
13
- end
14
- end
15
- end
16
-
17
- def self.convertSingleColPdf(job_id, work_dir,file_name, use_dir)
18
- cmd = "k2pdfopt -dev kpw #{file_name}"
19
- PTY.spawn(cmd) do |i,o|
20
- o.sync = true
21
- i.expect(/\S.*Enter option above \(h=help, q=quit\):/,10){
22
- o.puts "\n"
23
- o.flush
24
- }
25
- while( i.eof? == false )
26
- res = i.gets
27
- print res
28
- break unless res.index('written').nil?
29
- end
30
- end
31
- return getK2Pdf(job_id, work_dir, use_dir)
32
- end
33
-
34
- def self.fetchReference(file_name)
35
- reader = PDF::Reader.new(file_name)
36
- page_no = reader.
37
- pages.
38
- reject{|i|
39
- i.text.index(REFERENCE_START_REGEXP).nil?
40
- }.
41
- map(&:number).
42
- sort.
43
- shift
44
- puts "Detect References page=> #{page_no} "
45
- ref_page = reader.
46
- pages.
47
- select{|i|
48
- i.number >= page_no
49
- }.
50
- map{|i|
51
- i.text.gsub(/\n+/,"\n").gsub(/ +/,' ')
52
- }.
53
- join(' ').
54
- gsub(REFERENCE_REGEXP,"\n\\1").
55
- gsub('- ','').
56
- split("\n")
57
-
58
- return ref_page[(ref_page.index{|i| i =~ REFERENCE_START_REGEXP}+1)..ref_page.length].
59
- select{|i|
60
- i.length > 5
61
- }
62
- end
63
-
64
- def self.fetchFromPdfUrl(pdfUrl, work_dir, use_dir)
65
- job_id = makeId
66
- makeDir(job_id, work_dir) if use_dir
67
- file_name = makeFile(job_id, work_dir, use_dir)
68
-
69
- fetchPdfFile(pdfUrl, file_name)
70
- executed_pdf = convertSingleColPdf(job_id, work_dir, file_name, use_dir)
71
- references = fetchReference(executed_pdf)
72
- if use_dir
73
- removeDir(job_id, work_dir)
74
- else
75
- removeFile(job_id, work_dir)
76
- end
77
- return references
78
- end
79
-
80
- end