arxiv-references 0.1.5.1 → 0.1.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/arxiv/references/myUtil.rb +39 -31
- data/lib/arxiv/references/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f0a36d9c1f438e17231af3e1f12584893253e287
|
4
|
+
data.tar.gz: 68d7700f4a942e532c0639584a3c391feb2c0122
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a56c94bce408510688871319d74f7524e5e17c3a51fe7053ae6256fb2b0ac1c4e94890cc571ca153240e049b9f3764d3388ea78dcdb523a30af390b87942edab
|
7
|
+
data.tar.gz: fd90127fea1ca2368134aaedc5bce4ad80d50934c594afe9a6e34b1dc861cfd8bd34173b14a9fa8b19d7f331e0e2a5dbdb4557e2dfb86dfbed8794ffb5038ac5
|
@@ -23,16 +23,24 @@ module ArxivUtil
|
|
23
23
|
FileUtils.rm_rf("#{work_dir}/#{id}")
|
24
24
|
end
|
25
25
|
|
26
|
-
def self.makeFile(id, work_dir)
|
27
|
-
|
26
|
+
def self.makeFile(id, work_dir, use_dir)
|
27
|
+
if use_dir
|
28
|
+
return "#{work_dir}/#{id}/output.pdf"
|
29
|
+
else
|
30
|
+
return "#{work_dir}-#{id}-output.pdf"
|
31
|
+
end
|
28
32
|
end
|
29
33
|
|
30
|
-
def self. getK2Pdf(id, work_dir)
|
31
|
-
|
34
|
+
def self. getK2Pdf(id, work_dir, use_dir)
|
35
|
+
if use_dir
|
36
|
+
return "#{work_dir}/#{id}/output_k2opt.pdf"
|
37
|
+
else
|
38
|
+
return "#{work_dir}-#{id}-output_k2opt.pdf"
|
39
|
+
end
|
32
40
|
end
|
33
41
|
|
34
42
|
|
35
|
-
def self.fetchFromUrl(urlName, work_dir)
|
43
|
+
def self.fetchFromUrl(urlName, work_dir, use_dir)
|
36
44
|
puts "fetch => #{urlName}"
|
37
45
|
charset = nil
|
38
46
|
html = open(urlName) do |f|
|
@@ -46,7 +54,7 @@ module ArxivUtil
|
|
46
54
|
result[:authors] = page.xpath('//*[@id="abs"]/div[2]/div[2]/a').text
|
47
55
|
result[:abstruct] = page.xpath('//*[@id="abs"]/div[2]/blockquote').text
|
48
56
|
result[:pdfurl] = "#{BASE_URL}#{page.xpath('//*[@id="abs"]/div[1]/div[1]/ul/li[1]/a').attr('href').value}"
|
49
|
-
result[:references] = fetchFromPdfUrl(result[:pdfurl], work_dir)
|
57
|
+
result[:references] = fetchFromPdfUrl(result[:pdfurl], work_dir, use_dir)
|
50
58
|
return result.to_json
|
51
59
|
end
|
52
60
|
|
@@ -63,7 +71,7 @@ module ArxivUtil
|
|
63
71
|
end
|
64
72
|
end
|
65
73
|
|
66
|
-
def self.convertSingleColPdf(job_id, work_dir,file_name)
|
74
|
+
def self.convertSingleColPdf(job_id, work_dir,file_name, use_dir)
|
67
75
|
cmd = "k2pdfopt -dev kpw #{file_name}"
|
68
76
|
PTY.spawn(cmd) do |i,o|
|
69
77
|
o.sync = true
|
@@ -77,7 +85,7 @@ module ArxivUtil
|
|
77
85
|
break unless res.index('written').nil?
|
78
86
|
end
|
79
87
|
end
|
80
|
-
return getK2Pdf(job_id, work_dir)
|
88
|
+
return getK2Pdf(job_id, work_dir, use_dir)
|
81
89
|
end
|
82
90
|
|
83
91
|
def self.fetchReference(file_name)
|
@@ -90,35 +98,35 @@ module ArxivUtil
|
|
90
98
|
map(&:number).
|
91
99
|
sort.
|
92
100
|
shift
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
101
|
+
puts "Detect References page=> #{page_no} "
|
102
|
+
ref_page = reader.
|
103
|
+
pages.
|
104
|
+
select{|i|
|
105
|
+
i.number >= page_no
|
106
|
+
}.
|
107
|
+
map{|i|
|
108
|
+
i.text.gsub(/\n+/,"\n").gsub(/ +/,' ')
|
109
|
+
}.
|
110
|
+
join(' ').
|
111
|
+
gsub(REFERENCE_REGEXP,"\n\\1").
|
112
|
+
gsub('- ','').
|
113
|
+
split("\n")
|
106
114
|
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
115
|
+
return ref_page[(ref_page.index{|i| i =~ REFERENCE_START_REGEXP}+1)..ref_page.length].
|
116
|
+
select{|i|
|
117
|
+
i.length > 5
|
118
|
+
}
|
111
119
|
end
|
112
120
|
|
113
|
-
def self.fetchFromPdfUrl(pdfUrl, work_dir)
|
121
|
+
def self.fetchFromPdfUrl(pdfUrl, work_dir, use_dir)
|
114
122
|
job_id = makeId
|
115
|
-
makeDir(job_id, work_dir)
|
116
|
-
file_name = makeFile(job_id, work_dir)
|
117
|
-
|
123
|
+
makeDir(job_id, work_dir) unless use_dir
|
124
|
+
file_name = makeFile(job_id, work_dir, use_dir)
|
125
|
+
|
118
126
|
fetchPdfFile(pdfUrl, file_name)
|
119
|
-
executed_pdf = convertSingleColPdf(job_id, work_dir, file_name)
|
127
|
+
executed_pdf = convertSingleColPdf(job_id, work_dir, file_name, use_dir)
|
120
128
|
references = fetchReference(executed_pdf)
|
121
|
-
removeDir(job_id, work_dir)
|
129
|
+
removeDir(job_id, work_dir) unless use_dir
|
122
130
|
return references
|
123
131
|
end
|
124
132
|
end
|