arxiv-references 0.1.7.0 → 0.1.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c1fd3a9552e15b293ec4f57b89a2393e840f69d9
4
- data.tar.gz: 2cee2a9b1dc53d287c44c79cfd6f18e50d109118
3
+ metadata.gz: 41b13e1de94b5b60ded925f2621f87e881262b5b
4
+ data.tar.gz: 70882f9a8f74f0c8c549f23bd9c284fc0b24458e
5
5
  SHA512:
6
- metadata.gz: d6e24ff8fb896d0658fd7526ec62f61078a377f2b3df8ad200c4bd25cab97b0bba34ca6337991b5806eb048967e7e8ca3d17030020adc46ee8873f03612f963f
7
- data.tar.gz: 46c78134b1b051e495162d4ce659e0ea4f7134991eaf824cd2c0b96c2875c4aa0da4ce64db1dbfb87b2db81e97846b7afc3c797f571d54dfd3a7326c66c90c73
6
+ metadata.gz: 8439088f8eb532a5b9c0f092db5e0c918e2c49929669c6cbe8b8592b32ac88e7b63e515acac7c7cef099f7b51c09e6b7f959dd6247d39b54308805a38f572265
7
+ data.tar.gz: 9434188bcd8afa713bdc2384bb6c77af5f00ef6c9a06aeabf6f6915e48c35272f5e4f89a8a6159772f052a6487f94642eeb833913817eaec761ee29b3d9119ac
@@ -2,14 +2,14 @@ require 'open-uri'
2
2
  require 'nokogiri'
3
3
  require 'json'
4
4
 
5
- class Arxiv
5
+ class ArxivApi
6
6
  attr_reader :title, :authors, :abstruct, :pdfurl
7
7
  attr_accessor :references
8
8
  BASE_URL = 'https://arxiv.org'
9
9
  def initialize(id)
10
- url = "#{BASE_URL}/abs/#{id}" if id.index('http').nil?
10
+ id = "#{BASE_URL}/abs/#{id}" if id.index('http').nil?
11
11
  charset = nil
12
- html = open(url) do |f|
12
+ html = open(id) do |f|
13
13
  charset = f.charset
14
14
  f.read
15
15
  end
@@ -36,6 +36,11 @@ class Arxiv
36
36
  def fetch_pdfurl
37
37
  "#{BASE_URL}#{@page.xpath('//*[@id="abs"]/div[1]/div[1]/ul/li[1]/a').attr('href').value}"
38
38
  end
39
+
40
+ def to_json
41
+ JSON.pretty_generate({title: @title, authors: @authors, abstruct: @abstruct, pdfurl: @pdfurl, references: @references})
42
+ end
43
+
39
44
  end
40
45
 
41
46
 
@@ -4,7 +4,7 @@ require 'thor'
4
4
  require 'pathname'
5
5
  lib = Pathname.new(__FILE__).dirname.join().expand_path
6
6
  $:.unshift lib.to_s
7
- require 'myUtil'
7
+ require 'ArxivUtil'
8
8
  require 'json'
9
9
 
10
10
 
@@ -0,0 +1,23 @@
1
+
2
+ $:.unshift Pathname.new(__FILE__).dirname.join().expand_path.to_s
3
+ require 'ArxivApi'
4
+ require 'P3'
5
+
6
+
7
+ module ArxivUtil
8
+ BASE_URL = "https://arxiv.org"
9
+ def self.fetchFromUrl(urlName, work_dir, use_dir, use_pdf)
10
+ arxiv = ArxivApi.new(urlName)
11
+ arxiv.references = P3.fetchFromPdfUrl(arxiv.pdfurl, work_dir, use_dir) if use_pdf || use_pdf.nil?
12
+ return arxiv
13
+ end
14
+
15
+ def self.fetchFromArxivId(id, work_dir, use_dir, use_pdf)
16
+ target_url = "#{BASE_URL}/abs/#{id}"
17
+ fetchFromUrl(target_url, work_dir, use_dir, use_pdf)
18
+ end
19
+
20
+ def self.fetchFromPdfUrl(pdfUrl, work_dir, use_dir)
21
+ return P3.fetchFromPdfUrl(pdfUrl, work_dir, use_dir)
22
+ end
23
+ end
@@ -1,16 +1,13 @@
1
- require 'open-uri'
2
1
  require 'digest/sha2'
3
2
  require 'time'
4
3
  require 'fileutils'
5
4
  require 'pty'
6
5
  require 'expect'
7
6
  require 'pdf-reader'
8
- require 'nokogiri'
9
- require 'json'
10
7
 
11
- module ArxivUtil
8
+ class P3
12
9
  BASE_URL = "https://arxiv.org"
13
- REFERENCE_START_REGEXP = Regexp.new('References|REFERENCES|Reference|REFERENCE')
10
+ REFERENCE_START_REGEXP = Regexp.new('[rR][eE][fF][eE][rR][eE][nN][cC][eE][sS]*')
14
11
  REFERENCE_REGEXP = Regexp.new('(\[[0-9]?[0-9]\]|\[.+?\])')
15
12
  def self.makeId
16
13
  return Digest::SHA256.hexdigest Time.now.strftime("%F %H:%M:%S")
@@ -44,31 +41,6 @@ module ArxivUtil
44
41
  File.delete("#{work_dir}/#{id}-output.pdf")
45
42
  File.delete("#{work_dir}/#{id}-output_k2opt.pdf")
46
43
  end
47
-
48
-
49
- def self.fetchFromUrl(urlName, work_dir, use_dir, use_pdf)
50
- puts "fetch => #{urlName}"
51
- charset = nil
52
- html = open(urlName) do |f|
53
- charset = f.charset
54
- f.read
55
- end
56
-
57
- page = Nokogiri::HTML.parse(html, nil, charset)
58
- result = {}
59
- result[:title] = page.xpath('//*[@id="abs"]/div[2]/h1').children.select{|i| i.name=='text'}.shift.text.gsub(/\n/,'')
60
- result[:authors] = page.xpath('//*[@id="abs"]/div[2]/div[2]/a').map(&:text)
61
- result[:abstruct] = page.xpath('//*[@id="abs"]/div[2]/blockquote').children.select{|i| i.name = 'text'}.reverse.shift.text
62
- result[:pdfurl] = "#{BASE_URL}#{page.xpath('//*[@id="abs"]/div[1]/div[1]/ul/li[1]/a').attr('href').value}"
63
- result[:references] = fetchFromPdfUrl(result[:pdfurl], work_dir, use_dir) if use_pdf
64
- return result
65
- end
66
-
67
- def self.fetchFromArxivId(id, work_dir, use_dir, use_pdf)
68
- target_url = "#{BASE_URL}/abs/#{id}"
69
- fetchFromUrl(target_url, work_dir, use_dir, use_pdf)
70
- end
71
-
72
44
  def self.fetchPdfFile(pdfUrl,file_name)
73
45
  open(file_name, 'wb') do |o|
74
46
  open(pdfUrl) do |data|
@@ -94,7 +66,6 @@ module ArxivUtil
94
66
  return getK2Pdf(job_id, work_dir, use_dir)
95
67
  end
96
68
 
97
-
98
69
  def self.fetchReference(file_name)
99
70
  reader = PDF::Reader.new(file_name)
100
71
  page_no = reader.
@@ -105,27 +76,25 @@ module ArxivUtil
105
76
  map(&:number).
106
77
  sort.
107
78
  shift
108
- puts "Detect References page=> #{page_no} "
79
+
109
80
  ref_page = reader.
110
81
  pages.
111
82
  select{|i|
112
83
  i.number >= page_no
113
84
  }.
114
85
  map{|i|
115
- i.text.gsub(/\n+/,"\n").gsub(/ +/,' ')
86
+ i.text.gsub(/\n\n+/,"\n").gsub(/ +/,' ').gsub(/-\n +/,'')
116
87
  }.
117
88
  join(' ').
89
+ split("\n").
90
+ join(' ').
118
91
  gsub(REFERENCE_REGEXP,"\n\\1").
119
- gsub('- ','').
120
- split("\n")
121
-
122
- return ref_page[(ref_page.index{|i| i =~ REFERENCE_START_REGEXP}+1)..ref_page.length].
123
- select{|i|
124
- i.length > 5
125
- }
92
+ split("\n").
93
+ select{|i| i.length > 15}
94
+ return ref_page
126
95
  end
127
96
 
128
- def self.fetchFromPdfUrl(pdfUrl, work_dir, use_dir)
97
+ def self.fetchFromPdfUrl(pdfUrl, work_dir=true, use_dir=true)
129
98
  job_id = makeId
130
99
  makeDir(job_id, work_dir) if use_dir
131
100
  file_name = makeFile(job_id, work_dir, use_dir)
@@ -1,5 +1,5 @@
1
1
  module Arxiv
2
2
  module References
3
- VERSION = "0.1.7.0"
3
+ VERSION = "0.1.7.1"
4
4
  end
5
5
  end
@@ -4,7 +4,6 @@ lib = Pathname.new(__FILE__).dirname.join().expand_path.to_s
4
4
  $:.unshift lib
5
5
 
6
6
  require "references/version"
7
- require 'references/myUtil'
8
7
  require 'references/ArxivReferences'
9
8
  module Arxiv
10
9
  module References
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arxiv-references
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7.0
4
+ version: 0.1.7.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Takahiro Nishimura
@@ -115,10 +115,10 @@ files:
115
115
  - bin/setup
116
116
  - exe/arxiv-ref
117
117
  - lib/arxiv/references.rb
118
- - lib/arxiv/references/Arxiv.rb
118
+ - lib/arxiv/references/ArxivApi.rb
119
119
  - lib/arxiv/references/ArxivReferences.rb
120
- - lib/arxiv/references/FetchPaperPDF.rb
121
- - lib/arxiv/references/myUtil.rb
120
+ - lib/arxiv/references/ArxivUtil.rb
121
+ - lib/arxiv/references/P3.rb
122
122
  - lib/arxiv/references/version.rb
123
123
  homepage: https://github.com/nishimuuu/Arxiv-references
124
124
  licenses:
@@ -1,80 +0,0 @@
1
- require 'digest/sha2'
2
- require 'time'
3
- require 'fileutils'
4
- require 'pty'
5
- require 'expect'
6
- require 'pdf-reader'
7
-
8
- class P3
9
- def self.fetchPdfFile(pdfUrl,file_name)
10
- open(file_name, 'wb') do |o|
11
- open(pdfUrl) do |data|
12
- o.write(data.read)
13
- end
14
- end
15
- end
16
-
17
- def self.convertSingleColPdf(job_id, work_dir,file_name, use_dir)
18
- cmd = "k2pdfopt -dev kpw #{file_name}"
19
- PTY.spawn(cmd) do |i,o|
20
- o.sync = true
21
- i.expect(/\S.*Enter option above \(h=help, q=quit\):/,10){
22
- o.puts "\n"
23
- o.flush
24
- }
25
- while( i.eof? == false )
26
- res = i.gets
27
- print res
28
- break unless res.index('written').nil?
29
- end
30
- end
31
- return getK2Pdf(job_id, work_dir, use_dir)
32
- end
33
-
34
- def self.fetchReference(file_name)
35
- reader = PDF::Reader.new(file_name)
36
- page_no = reader.
37
- pages.
38
- reject{|i|
39
- i.text.index(REFERENCE_START_REGEXP).nil?
40
- }.
41
- map(&:number).
42
- sort.
43
- shift
44
- puts "Detect References page=> #{page_no} "
45
- ref_page = reader.
46
- pages.
47
- select{|i|
48
- i.number >= page_no
49
- }.
50
- map{|i|
51
- i.text.gsub(/\n+/,"\n").gsub(/ +/,' ')
52
- }.
53
- join(' ').
54
- gsub(REFERENCE_REGEXP,"\n\\1").
55
- gsub('- ','').
56
- split("\n")
57
-
58
- return ref_page[(ref_page.index{|i| i =~ REFERENCE_START_REGEXP}+1)..ref_page.length].
59
- select{|i|
60
- i.length > 5
61
- }
62
- end
63
-
64
- def self.fetchFromPdfUrl(pdfUrl, work_dir, use_dir)
65
- job_id = makeId
66
- makeDir(job_id, work_dir) if use_dir
67
- file_name = makeFile(job_id, work_dir, use_dir)
68
-
69
- fetchPdfFile(pdfUrl, file_name)
70
- executed_pdf = convertSingleColPdf(job_id, work_dir, file_name, use_dir)
71
- references = fetchReference(executed_pdf)
72
- if use_dir
73
- removeDir(job_id, work_dir)
74
- else
75
- removeFile(job_id, work_dir)
76
- end
77
- return references
78
- end
79
-
80
- end