extcite 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -1
- data/CHANGELOG.md +9 -0
- data/Gemfile.lock +13 -11
- data/README.md +1 -1
- data/doc/Array.html +205 -0
- data/doc/Configuration.html +296 -0
- data/doc/Extcite.html +1122 -0
- data/doc/Hash.html +380 -0
- data/doc/String.html +289 -0
- data/doc/Textminer.html +601 -0
- data/doc/Textminer/Fetch.html +447 -0
- data/doc/Textminer/Mined.html +509 -0
- data/doc/Textminer/Miner.html +385 -0
- data/doc/Textminer/Request.html +669 -0
- data/doc/Textminer/Response.html +923 -0
- data/doc/_index.html +135 -0
- data/doc/class_list.html +51 -0
- data/doc/css/common.css +1 -0
- data/doc/css/full_list.css +58 -0
- data/doc/css/style.css +496 -0
- data/doc/file.README.html +139 -0
- data/doc/file_list.html +56 -0
- data/doc/frames.html +17 -0
- data/doc/index.html +139 -0
- data/doc/js/app.js +314 -0
- data/doc/js/full_list.js +216 -0
- data/doc/js/jquery.js +4 -0
- data/doc/method_list.html +155 -0
- data/doc/top-level-namespace.html +397 -0
- data/extcite.gemspec +14 -4
- data/lib/extcite.rb +39 -24
- data/lib/extcite/version.rb +1 -1
- metadata +65 -9
data/doc/Extcite.html
ADDED
@@ -0,0 +1,1122 @@
|
|
1
|
+
<!DOCTYPE html>
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<meta charset="utf-8">
|
5
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
6
|
+
<title>
|
7
|
+
Module: Extcite
|
8
|
+
|
9
|
+
— Documentation by YARD 0.9.24
|
10
|
+
|
11
|
+
</title>
|
12
|
+
|
13
|
+
<link rel="stylesheet" href="css/style.css" type="text/css" />
|
14
|
+
|
15
|
+
<link rel="stylesheet" href="css/common.css" type="text/css" />
|
16
|
+
|
17
|
+
<script type="text/javascript">
|
18
|
+
pathId = "Extcite";
|
19
|
+
relpath = '';
|
20
|
+
</script>
|
21
|
+
|
22
|
+
|
23
|
+
<script type="text/javascript" charset="utf-8" src="js/jquery.js"></script>
|
24
|
+
|
25
|
+
<script type="text/javascript" charset="utf-8" src="js/app.js"></script>
|
26
|
+
|
27
|
+
|
28
|
+
</head>
|
29
|
+
<body>
|
30
|
+
<div class="nav_wrap">
|
31
|
+
<iframe id="nav" src="class_list.html?1"></iframe>
|
32
|
+
<div id="resizer"></div>
|
33
|
+
</div>
|
34
|
+
|
35
|
+
<div id="main" tabindex="-1">
|
36
|
+
<div id="header">
|
37
|
+
<div id="menu">
|
38
|
+
|
39
|
+
<a href="_index.html">Index (E)</a> »
|
40
|
+
|
41
|
+
|
42
|
+
<span class="title">Extcite</span>
|
43
|
+
|
44
|
+
</div>
|
45
|
+
|
46
|
+
<div id="search">
|
47
|
+
|
48
|
+
<a class="full_list_link" id="class_list_link"
|
49
|
+
href="class_list.html">
|
50
|
+
|
51
|
+
<svg width="24" height="24">
|
52
|
+
<rect x="0" y="4" width="24" height="4" rx="1" ry="1"></rect>
|
53
|
+
<rect x="0" y="12" width="24" height="4" rx="1" ry="1"></rect>
|
54
|
+
<rect x="0" y="20" width="24" height="4" rx="1" ry="1"></rect>
|
55
|
+
</svg>
|
56
|
+
</a>
|
57
|
+
|
58
|
+
</div>
|
59
|
+
<div class="clear"></div>
|
60
|
+
</div>
|
61
|
+
|
62
|
+
<div id="content"><h1>Module: Extcite
|
63
|
+
|
64
|
+
|
65
|
+
|
66
|
+
</h1>
|
67
|
+
<div class="box_info">
|
68
|
+
|
69
|
+
|
70
|
+
|
71
|
+
|
72
|
+
|
73
|
+
|
74
|
+
|
75
|
+
|
76
|
+
|
77
|
+
|
78
|
+
|
79
|
+
<dl>
|
80
|
+
<dt>Defined in:</dt>
|
81
|
+
<dd>lib/extcite.rb<span class="defines">,<br />
|
82
|
+
lib/extcite/version.rb</span>
|
83
|
+
</dd>
|
84
|
+
</dl>
|
85
|
+
|
86
|
+
</div>
|
87
|
+
|
88
|
+
|
89
|
+
|
90
|
+
<h2>
|
91
|
+
Constant Summary
|
92
|
+
<small><a href="#" class="constants_summary_toggle">collapse</a></small>
|
93
|
+
</h2>
|
94
|
+
|
95
|
+
<dl class="constants">
|
96
|
+
|
97
|
+
<dt id="VERSION-constant" class="">VERSION =
|
98
|
+
|
99
|
+
</dt>
|
100
|
+
<dd><pre class="code"><span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>0.4.0</span><span class='tstring_end'>"</span></span></pre></dd>
|
101
|
+
|
102
|
+
</dl>
|
103
|
+
|
104
|
+
|
105
|
+
|
106
|
+
|
107
|
+
|
108
|
+
|
109
|
+
|
110
|
+
|
111
|
+
|
112
|
+
<h2>
|
113
|
+
Class Method Summary
|
114
|
+
<small><a href="#" class="summary_toggle">collapse</a></small>
|
115
|
+
</h2>
|
116
|
+
|
117
|
+
<ul class="summary">
|
118
|
+
|
119
|
+
<li class="public ">
|
120
|
+
<span class="summary_signature">
|
121
|
+
|
122
|
+
<a href="#cont_neg-class_method" title="cont_neg (class method)">.<strong>cont_neg</strong>(ids:) ⇒ Object </a>
|
123
|
+
|
124
|
+
|
125
|
+
|
126
|
+
</span>
|
127
|
+
|
128
|
+
|
129
|
+
|
130
|
+
|
131
|
+
|
132
|
+
|
133
|
+
|
134
|
+
|
135
|
+
|
136
|
+
<span class="summary_desc"><div class='inline'>
|
137
|
+
<p>Get citation(s) using Crossref content negotiation.</p>
|
138
|
+
</div></span>
|
139
|
+
|
140
|
+
</li>
|
141
|
+
|
142
|
+
|
143
|
+
<li class="public ">
|
144
|
+
<span class="summary_signature">
|
145
|
+
|
146
|
+
<a href="#extract-class_method" title="extract (class method)">.<strong>extract</strong>(path:, file: "out.bib", output: "bib") ⇒ Object </a>
|
147
|
+
|
148
|
+
|
149
|
+
|
150
|
+
</span>
|
151
|
+
|
152
|
+
|
153
|
+
|
154
|
+
|
155
|
+
|
156
|
+
|
157
|
+
|
158
|
+
|
159
|
+
|
160
|
+
<span class="summary_desc"><div class='inline'>
|
161
|
+
<p>Extract DOIs from one or more PDFs.</p>
|
162
|
+
</div></span>
|
163
|
+
|
164
|
+
</li>
|
165
|
+
|
166
|
+
|
167
|
+
<li class="public ">
|
168
|
+
<span class="summary_signature">
|
169
|
+
|
170
|
+
<a href="#extract_dois-class_method" title="extract_dois (class method)">.<strong>extract_dois</strong>(path:) ⇒ Object </a>
|
171
|
+
|
172
|
+
|
173
|
+
|
174
|
+
</span>
|
175
|
+
|
176
|
+
|
177
|
+
|
178
|
+
|
179
|
+
|
180
|
+
|
181
|
+
|
182
|
+
|
183
|
+
|
184
|
+
<span class="summary_desc"><div class='inline'>
|
185
|
+
<p>Extract DOIs from one or more PDFs after extracting text.</p>
|
186
|
+
</div></span>
|
187
|
+
|
188
|
+
</li>
|
189
|
+
|
190
|
+
|
191
|
+
<li class="public ">
|
192
|
+
<span class="summary_signature">
|
193
|
+
|
194
|
+
<a href="#extract_from_metadata-class_method" title="extract_from_metadata (class method)">.<strong>extract_from_metadata</strong>(path:) ⇒ Object </a>
|
195
|
+
|
196
|
+
|
197
|
+
|
198
|
+
</span>
|
199
|
+
|
200
|
+
|
201
|
+
|
202
|
+
|
203
|
+
|
204
|
+
|
205
|
+
|
206
|
+
|
207
|
+
|
208
|
+
<span class="summary_desc"><div class='inline'>
|
209
|
+
<p>Try to extract DOIs from one or more PDF metadata sections.</p>
|
210
|
+
</div></span>
|
211
|
+
|
212
|
+
</li>
|
213
|
+
|
214
|
+
|
215
|
+
<li class="public ">
|
216
|
+
<span class="summary_signature">
|
217
|
+
|
218
|
+
<a href="#extract_text-class_method" title="extract_text (class method)">.<strong>extract_text</strong>(path:) ⇒ Object </a>
|
219
|
+
|
220
|
+
|
221
|
+
|
222
|
+
</span>
|
223
|
+
|
224
|
+
|
225
|
+
|
226
|
+
|
227
|
+
|
228
|
+
|
229
|
+
|
230
|
+
|
231
|
+
|
232
|
+
<span class="summary_desc"><div class='inline'>
|
233
|
+
<p>Extract text from a pdf, or many pdfs.</p>
|
234
|
+
</div></span>
|
235
|
+
|
236
|
+
</li>
|
237
|
+
|
238
|
+
|
239
|
+
<li class="public ">
|
240
|
+
<span class="summary_signature">
|
241
|
+
|
242
|
+
<a href="#get_ids-class_method" title="get_ids (class method)">.<strong>get_ids</strong>(txt:) ⇒ Object </a>
|
243
|
+
|
244
|
+
|
245
|
+
|
246
|
+
</span>
|
247
|
+
|
248
|
+
|
249
|
+
|
250
|
+
|
251
|
+
|
252
|
+
|
253
|
+
|
254
|
+
|
255
|
+
|
256
|
+
<span class="summary_desc"><div class='inline'>
|
257
|
+
<p>Get DOIs from a String or Array of Strings.</p>
|
258
|
+
</div></span>
|
259
|
+
|
260
|
+
</li>
|
261
|
+
|
262
|
+
|
263
|
+
</ul>
|
264
|
+
|
265
|
+
|
266
|
+
|
267
|
+
|
268
|
+
<div id="class_method_details" class="method_details_list">
|
269
|
+
<h2>Class Method Details</h2>
|
270
|
+
|
271
|
+
|
272
|
+
<div class="method_details first">
|
273
|
+
<h3 class="signature first" id="cont_neg-class_method">
|
274
|
+
|
275
|
+
.<strong>cont_neg</strong>(ids:) ⇒ <tt>Object</tt>
|
276
|
+
|
277
|
+
|
278
|
+
|
279
|
+
|
280
|
+
|
281
|
+
</h3><div class="docstring">
|
282
|
+
<div class="discussion">
|
283
|
+
|
284
|
+
<p>Get citation(s) using Crossref content negotiation</p>
|
285
|
+
|
286
|
+
<p>Return: a string of bib data</p>
|
287
|
+
|
288
|
+
|
289
|
+
</div>
|
290
|
+
</div>
|
291
|
+
<div class="tags">
|
292
|
+
|
293
|
+
<div class="examples">
|
294
|
+
<p class="tag_title">Examples:</p>
|
295
|
+
|
296
|
+
|
297
|
+
<pre class="example code"><code><span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>extcite</span><span class='tstring_end'>'</span></span>
|
298
|
+
<span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_cont_neg'>cont_neg</span><span class='lparen'>(</span><span class='label'>ids:</span> <span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>10.1016/j.dendro.2014.01.004</span><span class='tstring_end'>"</span></span><span class='rparen'>)</span></code></pre>
|
299
|
+
|
300
|
+
</div>
|
301
|
+
<p class="tag_title">Parameters:</p>
|
302
|
+
<ul class="param">
|
303
|
+
|
304
|
+
<li>
|
305
|
+
|
306
|
+
<span class='name'>ids</span>
|
307
|
+
|
308
|
+
|
309
|
+
<span class='type'>(<tt><span class='object_link'><a href="Array.html" title="Array (class)">Array</a></span>[<span class='object_link'><a href="String.html" title="String (class)">String</a></span>]</tt>)</span>
|
310
|
+
|
311
|
+
|
312
|
+
|
313
|
+
—
|
314
|
+
<div class='inline'>
|
315
|
+
<p>One or more DOIs in an array</p>
|
316
|
+
</div>
|
317
|
+
|
318
|
+
</li>
|
319
|
+
|
320
|
+
</ul>
|
321
|
+
|
322
|
+
|
323
|
+
</div><table class="source_code">
|
324
|
+
<tr>
|
325
|
+
<td>
|
326
|
+
<pre class="lines">
|
327
|
+
|
328
|
+
|
329
|
+
296
|
330
|
+
297
|
331
|
+
298
|
332
|
+
299</pre>
|
333
|
+
</td>
|
334
|
+
<td>
|
335
|
+
<pre class="code"><span class="info file"># File 'lib/extcite.rb', line 296</span>
|
336
|
+
|
337
|
+
<span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_cont_neg'>cont_neg</span><span class='lparen'>(</span><span class='label'>ids:</span><span class='rparen'>)</span>
|
338
|
+
<span class='id identifier rubyid_out'>out</span> <span class='op'>=</span> <span class='const'>Serrano</span><span class='period'>.</span><span class='id identifier rubyid_content_negotiation'>content_negotiation</span><span class='lparen'>(</span><span class='label'>ids:</span> <span class='id identifier rubyid_ids'>ids</span><span class='rparen'>)</span>
|
339
|
+
<span class='kw'>return</span> <span class='id identifier rubyid_out'>out</span>
|
340
|
+
<span class='kw'>end</span></pre>
|
341
|
+
</td>
|
342
|
+
</tr>
|
343
|
+
</table>
|
344
|
+
</div>
|
345
|
+
|
346
|
+
<div class="method_details ">
|
347
|
+
<h3 class="signature " id="extract-class_method">
|
348
|
+
|
349
|
+
.<strong>extract</strong>(path:, file: "out.bib", output: "bib") ⇒ <tt>Object</tt>
|
350
|
+
|
351
|
+
|
352
|
+
|
353
|
+
|
354
|
+
|
355
|
+
</h3><div class="docstring">
|
356
|
+
<div class="discussion">
|
357
|
+
|
358
|
+
<p>Extract DOIs from one or more PDFs</p>
|
359
|
+
|
360
|
+
<p>Return: writes bib files to a .bib file or an array if file is nil</p>
|
361
|
+
|
362
|
+
<pre class="code ruby"><code class="ruby">When writing to a file, `extract` by default appends to the end
|
363
|
+
of the file so you can build up your bibtex file with your
|
364
|
+
citations
|
365
|
+
</code></pre>
|
366
|
+
|
367
|
+
|
368
|
+
</div>
|
369
|
+
</div>
|
370
|
+
<div class="tags">
|
371
|
+
|
372
|
+
<div class="examples">
|
373
|
+
<p class="tag_title">Examples:</p>
|
374
|
+
|
375
|
+
|
376
|
+
<pre class="example code"><code><span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>extcite</span><span class='tstring_end'>'</span></span>
|
377
|
+
<span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>faraday</span><span class='tstring_end'>'</span></span><span class='comment'># get a paper in pdf format
|
378
|
+
</span>
|
379
|
+
<span class='id identifier rubyid_path'>path</span> <span class='op'>=</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>2068.pdf</span><span class='tstring_end'>'</span></span>
|
380
|
+
<span class='id identifier rubyid_res'>res</span> <span class='op'>=</span> <span class='const'>Faraday</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='symbol'>:url</span> <span class='op'>=></span> <span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>https://peerj.com/articles/2068.pdf</span><span class='tstring_end'>"</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_get'>get</span><span class='semicolon'>;</span>
|
381
|
+
<span class='id identifier rubyid_f'>f</span> <span class='op'>=</span> <span class='const'>File</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>wb</span><span class='tstring_end'>"</span></span><span class='rparen'>)</span><span class='semicolon'>;</span>
|
382
|
+
<span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_write'>write</span><span class='lparen'>(</span><span class='id identifier rubyid_res'>res</span><span class='period'>.</span><span class='id identifier rubyid_body'>body</span><span class='rparen'>)</span><span class='semicolon'>;</span>
|
383
|
+
<span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_close'>close</span><span class='lparen'>(</span><span class='rparen'>)</span><span class='comment'># extract doi from the pdf
|
384
|
+
</span>
|
385
|
+
<span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_extract'>extract</span><span class='lparen'>(</span><span class='label'>path:</span> <span class='id identifier rubyid_path'>path</span><span class='rparen'>)</span>
|
386
|
+
<span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_extract'>extract</span><span class='lparen'>(</span><span class='label'>path:</span> <span class='id identifier rubyid_path'>path</span><span class='comma'>,</span> <span class='label'>file:</span> <span class='kw'>nil</span><span class='rparen'>)</span></code></pre>
|
387
|
+
|
388
|
+
</div>
|
389
|
+
<p class="tag_title">Parameters:</p>
|
390
|
+
<ul class="param">
|
391
|
+
|
392
|
+
<li>
|
393
|
+
|
394
|
+
<span class='name'>path</span>
|
395
|
+
|
396
|
+
|
397
|
+
<span class='type'>(<tt><span class='object_link'><a href="String.html" title="String (class)">String</a></span></tt>)</span>
|
398
|
+
|
399
|
+
|
400
|
+
|
401
|
+
—
|
402
|
+
<div class='inline'>
|
403
|
+
<p>Path to a pdf file, or a folder of PDF files</p>
|
404
|
+
</div>
|
405
|
+
|
406
|
+
</li>
|
407
|
+
|
408
|
+
<li>
|
409
|
+
|
410
|
+
<span class='name'>file</span>
|
411
|
+
|
412
|
+
|
413
|
+
<span class='type'>(<tt><span class='object_link'><a href="String.html" title="String (class)">String</a></span></tt>)</span>
|
414
|
+
|
415
|
+
|
416
|
+
<em class="default">(defaults to: <tt>"out.bib"</tt>)</em>
|
417
|
+
|
418
|
+
|
419
|
+
—
|
420
|
+
<div class='inline'>
|
421
|
+
<p>File name to write data to - or nil to stdout</p>
|
422
|
+
</div>
|
423
|
+
|
424
|
+
</li>
|
425
|
+
|
426
|
+
<li>
|
427
|
+
|
428
|
+
<span class='name'>output</span>
|
429
|
+
|
430
|
+
|
431
|
+
<span class='type'>(<tt><span class='object_link'><a href="String.html" title="String (class)">String</a></span></tt>)</span>
|
432
|
+
|
433
|
+
|
434
|
+
<em class="default">(defaults to: <tt>"bib"</tt>)</em>
|
435
|
+
|
436
|
+
|
437
|
+
—
|
438
|
+
<div class='inline'>
|
439
|
+
<p>Type of output. Only bibtex for now</p>
|
440
|
+
</div>
|
441
|
+
|
442
|
+
</li>
|
443
|
+
|
444
|
+
</ul>
|
445
|
+
|
446
|
+
|
447
|
+
</div><table class="source_code">
|
448
|
+
<tr>
|
449
|
+
<td>
|
450
|
+
<pre class="lines">
|
451
|
+
|
452
|
+
|
453
|
+
38
|
454
|
+
39
|
455
|
+
40
|
456
|
+
41
|
457
|
+
42
|
458
|
+
43
|
459
|
+
44
|
460
|
+
45
|
461
|
+
46
|
462
|
+
47
|
463
|
+
48
|
464
|
+
49
|
465
|
+
50
|
466
|
+
51
|
467
|
+
52
|
468
|
+
53
|
469
|
+
54
|
470
|
+
55
|
471
|
+
56
|
472
|
+
57
|
473
|
+
58
|
474
|
+
59
|
475
|
+
60
|
476
|
+
61
|
477
|
+
62
|
478
|
+
63
|
479
|
+
64
|
480
|
+
65
|
481
|
+
66
|
482
|
+
67
|
483
|
+
68
|
484
|
+
69
|
485
|
+
70
|
486
|
+
71
|
487
|
+
72
|
488
|
+
73
|
489
|
+
74
|
490
|
+
75
|
491
|
+
76
|
492
|
+
77
|
493
|
+
78
|
494
|
+
79
|
495
|
+
80
|
496
|
+
81
|
497
|
+
82
|
498
|
+
83
|
499
|
+
84
|
500
|
+
85
|
501
|
+
86
|
502
|
+
87
|
503
|
+
88
|
504
|
+
89
|
505
|
+
90
|
506
|
+
91
|
507
|
+
92
|
508
|
+
93
|
509
|
+
94
|
510
|
+
95
|
511
|
+
96
|
512
|
+
97
|
513
|
+
98
|
514
|
+
99
|
515
|
+
100
|
516
|
+
101
|
517
|
+
102
|
518
|
+
103
|
519
|
+
104
|
520
|
+
105
|
521
|
+
106
|
522
|
+
107
|
523
|
+
108
|
524
|
+
109
|
525
|
+
110
|
526
|
+
111
|
527
|
+
112
|
528
|
+
113
|
529
|
+
114
|
530
|
+
115
|
531
|
+
116
|
532
|
+
117</pre>
|
533
|
+
</td>
|
534
|
+
<td>
|
535
|
+
<pre class="code"><span class="info file"># File 'lib/extcite.rb', line 38</span>
|
536
|
+
|
537
|
+
<span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_extract'>extract</span><span class='lparen'>(</span><span class='label'>path:</span><span class='comma'>,</span> <span class='label'>file:</span> <span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>out.bib</span><span class='tstring_end'>"</span></span><span class='comma'>,</span> <span class='label'>output:</span> <span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>bib</span><span class='tstring_end'>"</span></span><span class='rparen'>)</span>
|
538
|
+
<span class='id identifier rubyid_path'>path</span> <span class='op'>=</span> <span class='id identifier rubyid_make_paths'><span class='object_link'><a href="top-level-namespace.html#make_paths-instance_method" title="#make_paths (method)">make_paths</a></span></span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='rparen'>)</span>
|
539
|
+
<span class='id identifier rubyid_path'>path</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_x'>x</span><span class='op'>|</span>
|
540
|
+
<span class='comment'># try PDF metadata first
|
541
|
+
</span> <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='kw'>nil</span>
|
542
|
+
<span class='id identifier rubyid_rr'>rr</span> <span class='op'>=</span> <span class='const'>PDF</span><span class='op'>::</span><span class='const'>Reader</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='id identifier rubyid_x'>x</span><span class='rparen'>)</span><span class='semicolon'>;</span>
|
543
|
+
<span class='id identifier rubyid_pdfmeta'>pdfmeta</span> <span class='op'>=</span> <span class='id identifier rubyid_rr'>rr</span><span class='period'>.</span><span class='id identifier rubyid_metadata'>metadata</span>
|
544
|
+
<span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_pdfmeta'>pdfmeta</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
545
|
+
<span class='id identifier rubyid_xml'>xml</span> <span class='op'>=</span> <span class='const'>Oga</span><span class='period'>.</span><span class='id identifier rubyid_parse_xml'>parse_xml</span><span class='lparen'>(</span><span class='id identifier rubyid_pdfmeta'>pdfmeta</span><span class='rparen'>)</span><span class='semicolon'>;</span>
|
546
|
+
<span class='kw'>begin</span>
|
547
|
+
<span class='id identifier rubyid_tt'>tt</span> <span class='op'>=</span> <span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_xpath'>xpath</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>//rdf:Description</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span> <span class='comment'># try dc:identifier attribute
|
548
|
+
</span>
|
549
|
+
<span class='id identifier rubyid_ss'>ss</span> <span class='op'>=</span> <span class='id identifier rubyid_tt'>tt</span><span class='period'>.</span><span class='id identifier rubyid_attr'>attr</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>dc:identifier</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span><span class='lbracket'>[</span><span class='int'>0</span><span class='rbracket'>]</span>
|
550
|
+
<span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_ss'>ss</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
551
|
+
<span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='id identifier rubyid_ss'>ss</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span><span class='period'>.</span><span class='id identifier rubyid_sub'>sub</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>doi:</span><span class='regexp_end'>/</span></span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span>
|
552
|
+
<span class='kw'>else</span>
|
553
|
+
<span class='comment'># try prism:doi node
|
554
|
+
</span> <span class='id identifier rubyid_pdoi'>pdoi</span> <span class='op'>=</span> <span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_xpath'>xpath</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>//rdf:Description//prism:doi</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span>
|
555
|
+
<span class='kw'>if</span> <span class='id identifier rubyid_pdoi'>pdoi</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>==</span> <span class='int'>1</span>
|
556
|
+
<span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='id identifier rubyid_pdoi'>pdoi</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
|
557
|
+
<span class='kw'>else</span>
|
558
|
+
<span class='comment'># try pdf:WPS-ARTICLEDOI node
|
559
|
+
</span> <span class='id identifier rubyid_wpsdoi'>wpsdoi</span> <span class='op'>=</span> <span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_xpath'>xpath</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>//rdf:Description//pdf:WPS-ARTICLEDOI</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span>
|
560
|
+
<span class='kw'>if</span> <span class='id identifier rubyid_wpsdoi'>wpsdoi</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>==</span> <span class='int'>1</span>
|
561
|
+
<span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='id identifier rubyid_wpsdoi'>wpsdoi</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
|
562
|
+
<span class='kw'>else</span>
|
563
|
+
<span class='comment'># try pdfx:WPS-ARTICLEDOI node
|
564
|
+
</span> <span class='id identifier rubyid_pdfxwpsdoi'>pdfxwpsdoi</span> <span class='op'>=</span> <span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_xpath'>xpath</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>//rdf:Description//pdfx:WPS-ARTICLEDOI</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span>
|
565
|
+
<span class='kw'>if</span> <span class='id identifier rubyid_pdfxwpsdoi'>pdfxwpsdoi</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>==</span> <span class='int'>1</span>
|
566
|
+
<span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='id identifier rubyid_pdfxwpsdoi'>pdfxwpsdoi</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
|
567
|
+
<span class='kw'>else</span>
|
568
|
+
<span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='kw'>nil</span>
|
569
|
+
<span class='kw'>end</span>
|
570
|
+
<span class='kw'>end</span>
|
571
|
+
<span class='kw'>end</span>
|
572
|
+
<span class='kw'>end</span>
|
573
|
+
<span class='kw'>rescue</span>
|
574
|
+
<span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='kw'>nil</span>
|
575
|
+
<span class='kw'>end</span>
|
576
|
+
<span class='kw'>end</span>
|
577
|
+
|
578
|
+
<span class='comment'># if not found, try regexing for DOI
|
579
|
+
</span> <span class='kw'>if</span> <span class='id identifier rubyid_ids'>ids</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
580
|
+
<span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_get_ids'><span class='object_link'><a href="#get_ids-class_method" title="Extcite.get_ids (method)">get_ids</a></span></span><span class='lparen'>(</span><span class='label'>txt:</span> <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_extract_text_one'>extract_text_one</span><span class='lparen'>(</span><span class='id identifier rubyid_x'>x</span><span class='rparen'>)</span><span class='rparen'>)</span>
|
581
|
+
<span class='kw'>end</span>
|
582
|
+
|
583
|
+
<span class='kw'>if</span> <span class='id identifier rubyid_ids'>ids</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>==</span> <span class='int'>0</span>
|
584
|
+
<span class='id identifier rubyid_puts'>puts</span> <span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>no DOI found in </span><span class='tstring_end'>"</span></span> <span class='op'>+</span> <span class='id identifier rubyid_x'>x</span>
|
585
|
+
<span class='kw'>else</span>
|
586
|
+
<span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_ids'>ids</span><span class='period'>.</span><span class='id identifier rubyid_match'>match</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>arxiv</span><span class='regexp_end'>/i</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span> <span class='op'>&&</span> <span class='id identifier rubyid_ids'>ids</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'><</span> <span class='int'>200</span>
|
587
|
+
<span class='id identifier rubyid_conn'>conn</span> <span class='op'>=</span> <span class='const'>Faraday</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='symbol'>:url</span> <span class='op'>=></span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>http://export.arxiv.org/api/query?id_list=</span><span class='tstring_end'>'</span></span> <span class='op'>+</span> <span class='id identifier rubyid_ids'>ids</span><span class='period'>.</span><span class='id identifier rubyid_gsub'>gsub</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>arxiv:</span><span class='regexp_end'>/i</span></span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_get'>get</span>
|
588
|
+
<span class='id identifier rubyid_bibs'>bibs</span> <span class='op'>=</span> <span class='id identifier rubyid_conn'>conn</span><span class='period'>.</span><span class='id identifier rubyid_body'>body</span><span class='period'>.</span><span class='id identifier rubyid_make_bib_arxiv'>make_bib_arxiv</span><span class='lparen'>(</span><span class='id identifier rubyid_ids'>ids</span><span class='period'>.</span><span class='id identifier rubyid_gsub'>gsub</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>arxiv:</span><span class='regexp_end'>/i</span></span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span><span class='rparen'>)</span>
|
589
|
+
<span class='kw'>else</span>
|
590
|
+
<span class='id identifier rubyid_bibs'>bibs</span> <span class='op'>=</span> <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_cont_neg'><span class='object_link'><a href="#cont_neg-class_method" title="Extcite.cont_neg (method)">cont_neg</a></span></span><span class='lparen'>(</span><span class='label'>ids:</span> <span class='id identifier rubyid_ids'>ids</span><span class='rparen'>)</span>
|
591
|
+
<span class='kw'>end</span>
|
592
|
+
|
593
|
+
<span class='comment'># if an error or not found, skip
|
594
|
+
</span> <span class='id identifier rubyid_bibstest'>bibstest</span> <span class='op'>=</span> <span class='kw'>nil</span>
|
595
|
+
<span class='kw'>if</span> <span class='id identifier rubyid_bibs'>bibs</span><span class='period'>.</span><span class='id identifier rubyid_class'>class</span> <span class='op'>==</span> <span class='const'><span class='object_link'><a href="Array.html" title="Array (class)">Array</a></span></span>
|
596
|
+
<span class='id identifier rubyid_bibstest'>bibstest</span> <span class='op'>=</span> <span class='id identifier rubyid_bibs'>bibs</span><span class='lbracket'>[</span><span class='int'>0</span><span class='rbracket'>]</span>
|
597
|
+
<span class='kw'>else</span>
|
598
|
+
<span class='id identifier rubyid_bibstest'>bibstest</span> <span class='op'>=</span> <span class='id identifier rubyid_bibs'>bibs</span>
|
599
|
+
<span class='kw'>end</span>
|
600
|
+
|
601
|
+
<span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_bibstest'>bibstest</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
602
|
+
<span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_bibstest'>bibstest</span><span class='period'>.</span><span class='id identifier rubyid_match'>match</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>error|not found</span><span class='regexp_end'>/i</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span> <span class='op'>||</span> <span class='op'>!</span><span class='id identifier rubyid_bibstest'>bibstest</span><span class='period'>.</span><span class='id identifier rubyid_match'>match</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'><\/html></span><span class='regexp_end'>/i</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
603
|
+
<span class='id identifier rubyid_puts'>puts</span> <span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>DOI found: </span><span class='tstring_end'>"</span></span> <span class='op'>+</span> <span class='id identifier rubyid_ids'>ids</span> <span class='op'>+</span> <span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'> ; but citation not found via content negotation - passing</span><span class='tstring_end'>"</span></span> <span class='comment'># do something else?
|
604
|
+
</span>
|
605
|
+
<span class='kw'>else</span>
|
606
|
+
<span class='kw'>if</span> <span class='id identifier rubyid_file'>file</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
607
|
+
<span class='kw'>return</span> <span class='id identifier rubyid_bibstest'>bibstest</span>
|
608
|
+
<span class='kw'>else</span>
|
609
|
+
<span class='id identifier rubyid_puts'>puts</span> <span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>writing </span><span class='tstring_end'>"</span></span> <span class='op'>+</span> <span class='id identifier rubyid_ids'>ids</span> <span class='op'>+</span> <span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'> to </span><span class='tstring_end'>"</span></span> <span class='op'>+</span> <span class='id identifier rubyid_file'>file</span>
|
610
|
+
<span class='id identifier rubyid_bibs'>bibs</span><span class='period'>.</span><span class='id identifier rubyid_write_bib'>write_bib</span><span class='lparen'>(</span><span class='id identifier rubyid_file'>file</span><span class='rparen'>)</span>
|
611
|
+
<span class='kw'>end</span>
|
612
|
+
<span class='kw'>end</span>
|
613
|
+
<span class='kw'>end</span>
|
614
|
+
<span class='kw'>end</span>
|
615
|
+
<span class='kw'>end</span>
|
616
|
+
<span class='kw'>end</span></pre>
|
617
|
+
</td>
|
618
|
+
</tr>
|
619
|
+
</table>
|
620
|
+
</div>
|
621
|
+
|
622
|
+
<div class="method_details ">
|
623
|
+
<h3 class="signature " id="extract_dois-class_method">
|
624
|
+
|
625
|
+
.<strong>extract_dois</strong>(path:) ⇒ <tt>Object</tt>
|
626
|
+
|
627
|
+
|
628
|
+
|
629
|
+
|
630
|
+
|
631
|
+
</h3><div class="docstring">
|
632
|
+
<div class="discussion">
|
633
|
+
|
634
|
+
<p>Extract DOIs from one or more PDFs after extracting text</p>
|
635
|
+
|
636
|
+
|
637
|
+
</div>
|
638
|
+
</div>
|
639
|
+
<div class="tags">
|
640
|
+
|
641
|
+
<div class="examples">
|
642
|
+
<p class="tag_title">Examples:</p>
|
643
|
+
|
644
|
+
|
645
|
+
<pre class="example code"><code><span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>extcite</span><span class='tstring_end'>'</span></span>
|
646
|
+
<span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>faraday</span><span class='tstring_end'>'</span></span><span class='comment'># get a paper in pdf format
|
647
|
+
</span>
|
648
|
+
<span class='id identifier rubyid_path'>path</span> <span class='op'>=</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>2068.pdf</span><span class='tstring_end'>'</span></span>
|
649
|
+
<span class='id identifier rubyid_res'>res</span> <span class='op'>=</span> <span class='const'>Faraday</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='symbol'>:url</span> <span class='op'>=></span> <span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>https://peerj.com/articles/2068.pdf</span><span class='tstring_end'>"</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_get'>get</span>
|
650
|
+
<span class='id identifier rubyid_f'>f</span> <span class='op'>=</span> <span class='const'>File</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>wb</span><span class='tstring_end'>"</span></span><span class='rparen'>)</span>
|
651
|
+
<span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_write'>write</span><span class='lparen'>(</span><span class='id identifier rubyid_res'>res</span><span class='period'>.</span><span class='id identifier rubyid_body'>body</span><span class='rparen'>)</span>
|
652
|
+
<span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_close'>close</span><span class='lparen'>(</span><span class='rparen'>)</span><span class='comment'># extract doi from the pdf
|
653
|
+
</span>
|
654
|
+
<span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_extract_dois'>extract_dois</span><span class='lparen'>(</span><span class='label'>path:</span> <span class='id identifier rubyid_path'>path</span><span class='rparen'>)</span></code></pre>
|
655
|
+
|
656
|
+
</div>
|
657
|
+
<p class="tag_title">Parameters:</p>
|
658
|
+
<ul class="param">
|
659
|
+
|
660
|
+
<li>
|
661
|
+
|
662
|
+
<span class='name'>path</span>
|
663
|
+
|
664
|
+
|
665
|
+
<span class='type'>(<tt><span class='object_link'><a href="String.html" title="String (class)">String</a></span></tt>)</span>
|
666
|
+
|
667
|
+
|
668
|
+
|
669
|
+
—
|
670
|
+
<div class='inline'>
|
671
|
+
<p>Path to a PDF file, or a folder of PDF files</p>
|
672
|
+
</div>
|
673
|
+
|
674
|
+
</li>
|
675
|
+
|
676
|
+
</ul>
|
677
|
+
|
678
|
+
|
679
|
+
</div><table class="source_code">
|
680
|
+
<tr>
|
681
|
+
<td>
|
682
|
+
<pre class="lines">
|
683
|
+
|
684
|
+
|
685
|
+
210
|
686
|
+
211
|
687
|
+
212
|
688
|
+
213
|
689
|
+
214</pre>
|
690
|
+
</td>
|
691
|
+
<td>
|
692
|
+
<pre class="code"><span class="info file"># File 'lib/extcite.rb', line 210</span>
|
693
|
+
|
694
|
+
<span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_extract_dois'>extract_dois</span><span class='lparen'>(</span><span class='label'>path:</span><span class='rparen'>)</span>
|
695
|
+
<span class='id identifier rubyid_txt'>txt</span> <span class='op'>=</span> <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_extract_text'><span class='object_link'><a href="#extract_text-class_method" title="Extcite.extract_text (method)">extract_text</a></span></span><span class='lparen'>(</span><span class='label'>path:</span> <span class='id identifier rubyid_path'>path</span><span class='rparen'>)</span> <span class='comment'># return txt.map { |z| z.match("[0-9]+\\.[0-9]+/.+").to_s.gsub(/\s.+/, '') }
|
696
|
+
</span>
|
697
|
+
<span class='kw'>return</span> <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_get_ids'><span class='object_link'><a href="#get_ids-class_method" title="Extcite.get_ids (method)">get_ids</a></span></span><span class='lparen'>(</span><span class='label'>txt:</span> <span class='id identifier rubyid_txt'>txt</span><span class='rparen'>)</span>
|
698
|
+
<span class='kw'>end</span></pre>
|
699
|
+
</td>
|
700
|
+
</tr>
|
701
|
+
</table>
|
702
|
+
</div>
|
703
|
+
|
704
|
+
<div class="method_details ">
|
705
|
+
<h3 class="signature " id="extract_from_metadata-class_method">
|
706
|
+
|
707
|
+
.<strong>extract_from_metadata</strong>(path:) ⇒ <tt>Object</tt>
|
708
|
+
|
709
|
+
|
710
|
+
|
711
|
+
|
712
|
+
|
713
|
+
</h3><div class="docstring">
|
714
|
+
<div class="discussion">
|
715
|
+
|
716
|
+
<p>Try to extract DOIs from one or more PDF metadata sections</p>
|
717
|
+
|
718
|
+
<p>Return: DOI string</p>
|
719
|
+
|
720
|
+
|
721
|
+
</div>
|
722
|
+
</div>
|
723
|
+
<div class="tags">
|
724
|
+
|
725
|
+
<div class="examples">
|
726
|
+
<p class="tag_title">Examples:</p>
|
727
|
+
|
728
|
+
|
729
|
+
<pre class="example code"><code><span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>extcite</span><span class='tstring_end'>'</span></span>
|
730
|
+
<span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>faraday</span><span class='tstring_end'>'</span></span><span class='comment'># get a paper in pdf format
|
731
|
+
</span>
|
732
|
+
<span class='id identifier rubyid_path'>path</span> <span class='op'>=</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>2068.pdf</span><span class='tstring_end'>'</span></span>
|
733
|
+
<span class='id identifier rubyid_res'>res</span> <span class='op'>=</span> <span class='const'>Faraday</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='symbol'>:url</span> <span class='op'>=></span> <span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>https://peerj.com/articles/2068.pdf</span><span class='tstring_end'>"</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_get'>get</span><span class='semicolon'>;</span>
|
734
|
+
<span class='id identifier rubyid_f'>f</span> <span class='op'>=</span> <span class='const'>File</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>wb</span><span class='tstring_end'>"</span></span><span class='rparen'>)</span><span class='semicolon'>;</span>
|
735
|
+
<span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_write'>write</span><span class='lparen'>(</span><span class='id identifier rubyid_res'>res</span><span class='period'>.</span><span class='id identifier rubyid_body'>body</span><span class='rparen'>)</span>
|
736
|
+
<span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_close'>close</span><span class='lparen'>(</span><span class='rparen'>)</span><span class='comment'># extract doi from the pdf
|
737
|
+
</span>
|
738
|
+
<span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_extract_from_metadata'>extract_from_metadata</span><span class='lparen'>(</span><span class='label'>path:</span> <span class='id identifier rubyid_path'>path</span><span class='rparen'>)</span></code></pre>
|
739
|
+
|
740
|
+
</div>
|
741
|
+
<p class="tag_title">Parameters:</p>
|
742
|
+
<ul class="param">
|
743
|
+
|
744
|
+
<li>
|
745
|
+
|
746
|
+
<span class='name'>path</span>
|
747
|
+
|
748
|
+
|
749
|
+
<span class='type'>(<tt><span class='object_link'><a href="String.html" title="String (class)">String</a></span></tt>)</span>
|
750
|
+
|
751
|
+
|
752
|
+
|
753
|
+
—
|
754
|
+
<div class='inline'>
|
755
|
+
<p>Path to a PDF file, or a folder of PDF files</p>
|
756
|
+
</div>
|
757
|
+
|
758
|
+
</li>
|
759
|
+
|
760
|
+
</ul>
|
761
|
+
|
762
|
+
|
763
|
+
</div><table class="source_code">
|
764
|
+
<tr>
|
765
|
+
<td>
|
766
|
+
<pre class="lines">
|
767
|
+
|
768
|
+
|
769
|
+
137
|
770
|
+
138
|
771
|
+
139
|
772
|
+
140
|
773
|
+
141
|
774
|
+
142
|
775
|
+
143
|
776
|
+
144
|
777
|
+
145
|
778
|
+
146
|
779
|
+
147
|
780
|
+
148
|
781
|
+
149
|
782
|
+
150
|
783
|
+
151
|
784
|
+
152
|
785
|
+
153
|
786
|
+
154
|
787
|
+
155
|
788
|
+
156
|
789
|
+
157
|
790
|
+
158
|
791
|
+
159
|
792
|
+
160
|
793
|
+
161
|
794
|
+
162
|
795
|
+
163
|
796
|
+
164
|
797
|
+
165
|
798
|
+
166
|
799
|
+
167
|
800
|
+
168
|
801
|
+
169
|
802
|
+
170
|
803
|
+
171
|
804
|
+
172
|
805
|
+
173
|
806
|
+
174
|
807
|
+
175
|
808
|
+
176
|
809
|
+
177
|
810
|
+
178
|
811
|
+
179
|
812
|
+
180
|
813
|
+
181
|
814
|
+
182
|
815
|
+
183
|
816
|
+
184
|
817
|
+
185
|
818
|
+
186
|
819
|
+
187
|
820
|
+
188
|
821
|
+
189
|
822
|
+
190
|
823
|
+
191
|
824
|
+
192</pre>
|
825
|
+
</td>
|
826
|
+
<td>
|
827
|
+
<pre class="code"><span class="info file"># File 'lib/extcite.rb', line 137</span>
|
828
|
+
|
829
|
+
<span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_extract_from_metadata'>extract_from_metadata</span><span class='lparen'>(</span><span class='label'>path:</span><span class='rparen'>)</span>
|
830
|
+
<span class='id identifier rubyid_path'>path</span> <span class='op'>=</span> <span class='id identifier rubyid_make_paths'><span class='object_link'><a href="top-level-namespace.html#make_paths-instance_method" title="#make_paths (method)">make_paths</a></span></span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='rparen'>)</span>
|
831
|
+
<span class='id identifier rubyid_path'>path</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_x'>x</span><span class='op'>|</span>
|
832
|
+
<span class='comment'># try PDF metadata first
|
833
|
+
</span> <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='kw'>nil</span>
|
834
|
+
<span class='id identifier rubyid_rr'>rr</span> <span class='op'>=</span> <span class='const'>PDF</span><span class='op'>::</span><span class='const'>Reader</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='id identifier rubyid_x'>x</span><span class='rparen'>)</span>
|
835
|
+
<span class='id identifier rubyid_pdfmeta'>pdfmeta</span> <span class='op'>=</span> <span class='id identifier rubyid_rr'>rr</span><span class='period'>.</span><span class='id identifier rubyid_metadata'>metadata</span>
|
836
|
+
<span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_pdfmeta'>pdfmeta</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
837
|
+
<span class='kw'>begin</span>
|
838
|
+
<span class='id identifier rubyid_xml'>xml</span> <span class='op'>=</span> <span class='const'>Oga</span><span class='period'>.</span><span class='id identifier rubyid_parse_xml'>parse_xml</span><span class='lparen'>(</span><span class='id identifier rubyid_pdfmeta'>pdfmeta</span><span class='rparen'>)</span><span class='semicolon'>;</span>
|
839
|
+
<span class='kw'>rescue</span> <span class='const'>Exception</span> <span class='op'>=></span> <span class='id identifier rubyid_e'>e</span>
|
840
|
+
<span class='id identifier rubyid_xml'>xml</span> <span class='op'>=</span> <span class='kw'>nil</span>
|
841
|
+
<span class='kw'>end</span>
|
842
|
+
|
843
|
+
<span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
844
|
+
<span class='kw'>begin</span>
|
845
|
+
<span class='id identifier rubyid_tt'>tt</span> <span class='op'>=</span> <span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_xpath'>xpath</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>//rdf:Description</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span> <span class='comment'># try dc:identifier attribute
|
846
|
+
</span>
|
847
|
+
<span class='id identifier rubyid_ss'>ss</span> <span class='op'>=</span> <span class='id identifier rubyid_tt'>tt</span><span class='period'>.</span><span class='id identifier rubyid_attr'>attr</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>dc:identifier</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span><span class='lbracket'>[</span><span class='int'>0</span><span class='rbracket'>]</span>
|
848
|
+
<span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_ss'>ss</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
849
|
+
<span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='id identifier rubyid_ss'>ss</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span><span class='period'>.</span><span class='id identifier rubyid_sub'>sub</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>doi:</span><span class='regexp_end'>/</span></span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span>
|
850
|
+
<span class='kw'>else</span>
|
851
|
+
<span class='comment'># try prism:doi node
|
852
|
+
</span> <span class='id identifier rubyid_pdoi'>pdoi</span> <span class='op'>=</span> <span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_xpath'>xpath</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>//rdf:Description//prism:doi</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span>
|
853
|
+
<span class='kw'>if</span> <span class='id identifier rubyid_pdoi'>pdoi</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>==</span> <span class='int'>1</span>
|
854
|
+
<span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='id identifier rubyid_pdoi'>pdoi</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
|
855
|
+
<span class='kw'>else</span>
|
856
|
+
<span class='comment'># try pdf:WPS-ARTICLEDOI node
|
857
|
+
</span> <span class='id identifier rubyid_wpsdoi'>wpsdoi</span> <span class='op'>=</span> <span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_xpath'>xpath</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>//rdf:Description//pdf:WPS-ARTICLEDOI</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span>
|
858
|
+
<span class='kw'>if</span> <span class='id identifier rubyid_wpsdoi'>wpsdoi</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>==</span> <span class='int'>1</span>
|
859
|
+
<span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='id identifier rubyid_wpsdoi'>wpsdoi</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
|
860
|
+
<span class='kw'>else</span>
|
861
|
+
<span class='comment'># try pdfx:WPS-ARTICLEDOI node
|
862
|
+
</span> <span class='id identifier rubyid_pdfxwpsdoi'>pdfxwpsdoi</span> <span class='op'>=</span> <span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_xpath'>xpath</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>//rdf:Description//pdfx:WPS-ARTICLEDOI</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span>
|
863
|
+
<span class='kw'>if</span> <span class='id identifier rubyid_pdfxwpsdoi'>pdfxwpsdoi</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>==</span> <span class='int'>1</span>
|
864
|
+
<span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='id identifier rubyid_pdfxwpsdoi'>pdfxwpsdoi</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
|
865
|
+
<span class='kw'>else</span>
|
866
|
+
<span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='kw'>nil</span>
|
867
|
+
<span class='kw'>end</span>
|
868
|
+
<span class='kw'>end</span>
|
869
|
+
<span class='kw'>end</span>
|
870
|
+
<span class='kw'>end</span>
|
871
|
+
<span class='kw'>rescue</span>
|
872
|
+
<span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='kw'>nil</span>
|
873
|
+
<span class='kw'>end</span>
|
874
|
+
<span class='kw'>end</span>
|
875
|
+
<span class='kw'>end</span>
|
876
|
+
|
877
|
+
<span class='comment'># if not found, try regexing for DOI
|
878
|
+
</span> <span class='kw'>if</span> <span class='id identifier rubyid_ids'>ids</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
|
879
|
+
<span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_get_ids'><span class='object_link'><a href="#get_ids-class_method" title="Extcite.get_ids (method)">get_ids</a></span></span><span class='lparen'>(</span><span class='label'>txt:</span> <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_extract_text_one'>extract_text_one</span><span class='lparen'>(</span><span class='id identifier rubyid_x'>x</span><span class='rparen'>)</span><span class='rparen'>)</span>
|
880
|
+
<span class='kw'>end</span>
|
881
|
+
|
882
|
+
<span class='kw'>return</span> <span class='id identifier rubyid_ids'>ids</span>
|
883
|
+
<span class='kw'>end</span>
|
884
|
+
<span class='kw'>end</span></pre>
|
885
|
+
</td>
|
886
|
+
</tr>
|
887
|
+
</table>
|
888
|
+
</div>
|
889
|
+
|
890
|
+
<div class="method_details ">
|
891
|
+
<h3 class="signature " id="extract_text-class_method">
|
892
|
+
|
893
|
+
.<strong>extract_text</strong>(path:) ⇒ <tt>Object</tt>
|
894
|
+
|
895
|
+
|
896
|
+
|
897
|
+
|
898
|
+
|
899
|
+
</h3><div class="docstring">
|
900
|
+
<div class="discussion">
|
901
|
+
|
902
|
+
<p>Extract text from a PDF, or many PDFs</p>
|
903
|
+
|
904
|
+
<p>This method is used internally within fetch to parse PDFs.</p>
|
905
|
+
|
906
|
+
|
907
|
+
</div>
|
908
|
+
</div>
|
909
|
+
<div class="tags">
|
910
|
+
|
911
|
+
<div class="examples">
|
912
|
+
<p class="tag_title">Examples:</p>
|
913
|
+
|
914
|
+
|
915
|
+
<pre class="example code"><code><span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>extcite</span><span class='tstring_end'>'</span></span>
|
916
|
+
<span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>faraday</span><span class='tstring_end'>'</span></span><span class='comment'># get a paper in pdf format
|
917
|
+
</span>
|
918
|
+
<span class='id identifier rubyid_path'>path</span> <span class='op'>=</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>2068.pdf</span><span class='tstring_end'>'</span></span>
|
919
|
+
<span class='id identifier rubyid_res'>res</span> <span class='op'>=</span> <span class='const'>Faraday</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='symbol'>:url</span> <span class='op'>=></span> <span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>https://peerj.com/articles/2068.pdf</span><span class='tstring_end'>"</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_get'>get</span>
|
920
|
+
<span class='id identifier rubyid_f'>f</span> <span class='op'>=</span> <span class='const'>File</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>wb</span><span class='tstring_end'>"</span></span><span class='rparen'>)</span>
|
921
|
+
<span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_write'>write</span><span class='lparen'>(</span><span class='id identifier rubyid_res'>res</span><span class='period'>.</span><span class='id identifier rubyid_body'>body</span><span class='rparen'>)</span>
|
922
|
+
<span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_close'>close</span><span class='lparen'>(</span><span class='rparen'>)</span><span class='comment'># extract text from the pdf
|
923
|
+
</span>
|
924
|
+
<span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_extract_text'>extract_text</span><span class='lparen'>(</span><span class='label'>path:</span> <span class='id identifier rubyid_path'>path</span><span class='rparen'>)</span></code></pre>
|
925
|
+
|
926
|
+
</div>
|
927
|
+
<p class="tag_title">Parameters:</p>
|
928
|
+
<ul class="param">
|
929
|
+
|
930
|
+
<li>
|
931
|
+
|
932
|
+
<span class='name'>path</span>
|
933
|
+
|
934
|
+
|
935
|
+
<span class='type'>(<tt><span class='object_link'><a href="String.html" title="String (class)">String</a></span></tt>)</span>
|
936
|
+
|
937
|
+
|
938
|
+
|
939
|
+
—
|
940
|
+
<div class='inline'>
|
941
|
+
<p>Path to a PDF file, or a folder of PDF files</p>
|
942
|
+
</div>
|
943
|
+
|
944
|
+
</li>
|
945
|
+
|
946
|
+
</ul>
|
947
|
+
|
948
|
+
|
949
|
+
</div><table class="source_code">
|
950
|
+
<tr>
|
951
|
+
<td>
|
952
|
+
<pre class="lines">
|
953
|
+
|
954
|
+
|
955
|
+
263
|
956
|
+
264
|
957
|
+
265
|
958
|
+
266
|
959
|
+
267
|
960
|
+
268
|
961
|
+
269
|
962
|
+
270
|
963
|
+
271
|
964
|
+
272
|
965
|
+
273
|
966
|
+
274
|
967
|
+
275
|
968
|
+
276
|
969
|
+
277
|
970
|
+
278
|
971
|
+
279
|
972
|
+
280
|
973
|
+
281
|
974
|
+
282
|
975
|
+
283
|
976
|
+
284</pre>
|
977
|
+
</td>
|
978
|
+
<td>
|
979
|
+
<pre class="code"><span class="info file"># File 'lib/extcite.rb', line 263</span>
|
980
|
+
|
981
|
+
<span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_extract_text'>extract_text</span><span class='lparen'>(</span><span class='label'>path:</span><span class='rparen'>)</span>
|
982
|
+
<span class='id identifier rubyid_path'>path</span> <span class='op'>=</span> <span class='const'><span class='object_link'><a href="Array.html" title="Array (class)">Array</a></span></span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='rparen'>)</span>
|
983
|
+
<span class='kw'>if</span> <span class='id identifier rubyid_path'>path</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>==</span> <span class='int'>1</span>
|
984
|
+
<span class='kw'>if</span> <span class='const'>File</span><span class='period'>.</span><span class='id identifier rubyid_directory?'>directory?</span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='lbracket'>[</span><span class='int'>0</span><span class='rbracket'>]</span><span class='rparen'>)</span> <span class='comment'># keep only files with .pdf extension
|
985
|
+
</span>
|
986
|
+
<span class='id identifier rubyid_path'>path</span> <span class='op'>=</span> <span class='id identifier rubyid_dir_files'><span class='object_link'><a href="top-level-namespace.html#dir_files-instance_method" title="#dir_files (method)">dir_files</a></span></span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='lbracket'>[</span><span class='int'>0</span><span class='rbracket'>]</span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_keep_if'>keep_if</span> <span class='lbrace'>{</span> <span class='op'>|</span><span class='id identifier rubyid_z'>z</span><span class='op'>|</span> <span class='op'>!</span><span class='op'>!</span><span class='id identifier rubyid_z'>z</span><span class='period'>.</span><span class='id identifier rubyid_match'>match</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>.pdf</span><span class='regexp_end'>/</span></span><span class='rparen'>)</span> <span class='rbrace'>}</span>
|
987
|
+
<span class='kw'>end</span>
|
988
|
+
<span class='kw'>end</span>
|
989
|
+
|
990
|
+
<span class='id identifier rubyid_out'>out</span> <span class='op'>=</span> <span class='lbracket'>[</span><span class='rbracket'>]</span>
|
991
|
+
<span class='id identifier rubyid_path'>path</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_x'>x</span><span class='op'>|</span>
|
992
|
+
<span class='kw'>begin</span>
|
993
|
+
<span class='id identifier rubyid_rr'>rr</span> <span class='op'>=</span> <span class='const'>PDF</span><span class='op'>::</span><span class='const'>Reader</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='id identifier rubyid_x'>x</span><span class='rparen'>)</span>
|
994
|
+
<span class='id identifier rubyid_txt'>txt</span> <span class='op'>=</span> <span class='id identifier rubyid_rr'>rr</span><span class='period'>.</span><span class='id identifier rubyid_pages'>pages</span><span class='period'>.</span><span class='id identifier rubyid_map'>map</span> <span class='lbrace'>{</span> <span class='op'>|</span><span class='id identifier rubyid_page'>page</span><span class='op'>|</span> <span class='id identifier rubyid_page'>page</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span> <span class='rbrace'>}</span><span class='period'>.</span><span class='id identifier rubyid_join'>join</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_content'>\n</span><span class='tstring_end'>"</span></span><span class='rparen'>)</span>
|
995
|
+
<span class='kw'>rescue</span> <span class='const'>Exception</span> <span class='op'>=></span> <span class='id identifier rubyid_e'>e</span>
|
996
|
+
<span class='id identifier rubyid_warn'>warn</span> <span class='id identifier rubyid_e'>e</span>
|
997
|
+
<span class='id identifier rubyid_txt'>txt</span> <span class='op'>=</span> <span class='tstring'><span class='tstring_beg'>"</span><span class='tstring_end'>"</span></span>
|
998
|
+
<span class='kw'>end</span>
|
999
|
+
<span class='id identifier rubyid_out'>out</span> <span class='op'><<</span> <span class='id identifier rubyid_txt'>txt</span>
|
1000
|
+
<span class='kw'>end</span>
|
1001
|
+
<span class='kw'>return</span> <span class='id identifier rubyid_out'>out</span>
|
1002
|
+
<span class='kw'>end</span></pre>
|
1003
|
+
</td>
|
1004
|
+
</tr>
|
1005
|
+
</table>
|
1006
|
+
</div>
|
1007
|
+
|
1008
|
+
<div class="method_details ">
|
1009
|
+
<h3 class="signature " id="get_ids-class_method">
|
1010
|
+
|
1011
|
+
.<strong>get_ids</strong>(txt:) ⇒ <tt>Object</tt>
|
1012
|
+
|
1013
|
+
|
1014
|
+
|
1015
|
+
|
1016
|
+
|
1017
|
+
</h3><div class="docstring">
|
1018
|
+
<div class="discussion">
|
1019
|
+
|
1020
|
+
<p>Get DOIs from a String or Array of Strings</p>
|
1021
|
+
|
1022
|
+
<p>Return: String — the DOI (or arXiv id) found in the first String given; empty if none is found</p>
|
1023
|
+
|
1024
|
+
|
1025
|
+
</div>
|
1026
|
+
</div>
|
1027
|
+
<div class="tags">
|
1028
|
+
|
1029
|
+
<div class="examples">
|
1030
|
+
<p class="tag_title">Examples:</p>
|
1031
|
+
|
1032
|
+
|
1033
|
+
<pre class="example code"><code><span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>extcite</span><span class='tstring_end'>'</span></span>
|
1034
|
+
<span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_get_ids'>get_ids</span><span class='lparen'>(</span><span class='label'>txt:</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>10.1016/j.dendro.2014.01.004 adfasdf asd fas df asdfsd</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span></code></pre>
|
1035
|
+
|
1036
|
+
</div>
|
1037
|
+
<p class="tag_title">Parameters:</p>
|
1038
|
+
<ul class="param">
|
1039
|
+
|
1040
|
+
<li>
|
1041
|
+
|
1042
|
+
<span class='name'>txt</span>
|
1043
|
+
|
1044
|
+
|
1045
|
+
<span class='type'>(<tt><span class='object_link'><a href="String.html" title="String (class)">String</a></span></tt>)</span>
|
1046
|
+
|
1047
|
+
|
1048
|
+
|
1049
|
+
—
|
1050
|
+
<div class='inline'>
|
1051
|
+
<p>String or Array of Strings</p>
|
1052
|
+
</div>
|
1053
|
+
|
1054
|
+
</li>
|
1055
|
+
|
1056
|
+
</ul>
|
1057
|
+
|
1058
|
+
|
1059
|
+
</div><table class="source_code">
|
1060
|
+
<tr>
|
1061
|
+
<td>
|
1062
|
+
<pre class="lines">
|
1063
|
+
|
1064
|
+
|
1065
|
+
226
|
1066
|
+
227
|
1067
|
+
228
|
1068
|
+
229
|
1069
|
+
230
|
1070
|
+
231
|
1071
|
+
232
|
1072
|
+
233
|
1073
|
+
234
|
1074
|
+
235
|
1075
|
+
236
|
1076
|
+
237
|
1077
|
+
238
|
1078
|
+
239
|
1079
|
+
240
|
1080
|
+
241
|
1081
|
+
242
|
1082
|
+
243</pre>
|
1083
|
+
</td>
|
1084
|
+
<td>
|
1085
|
+
<pre class="code"><span class="info file"># File 'lib/extcite.rb', line 226</span>
|
1086
|
+
|
1087
|
+
<span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_get_ids'>get_ids</span><span class='lparen'>(</span><span class='label'>txt:</span><span class='rparen'>)</span>
|
1088
|
+
<span class='comment'># see if there's
|
1089
|
+
</span>
|
1090
|
+
<span class='kw'>return</span> <span class='const'><span class='object_link'><a href="Array.html" title="Array (class)">Array</a></span></span><span class='lparen'>(</span><span class='id identifier rubyid_txt'>txt</span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_map'>map</span> <span class='lbrace'>{</span> <span class='op'>|</span><span class='id identifier rubyid_z'>z</span><span class='op'>|</span>
|
1091
|
+
<span class='comment'># detect if is an arxiv paper
|
1092
|
+
</span> <span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_z'>z</span><span class='period'>.</span><span class='id identifier rubyid_match'>match</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>arxiv:[0-9]+\.[0-9A-Za-z]+</span><span class='regexp_end'>/i</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span> <span class='comment'># if so, return arxiv id for later extraction of arxiv citation via their API
|
1093
|
+
</span>
|
1094
|
+
<span class='id identifier rubyid_z'>z</span> <span class='op'>=</span> <span class='id identifier rubyid_z'>z</span><span class='period'>.</span><span class='id identifier rubyid_match'>match</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>arxiv:[0-9]+\.[0-9A-Za-z]+</span><span class='regexp_end'>/i</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_to_s'>to_s</span>
|
1095
|
+
<span class='kw'>else</span>
|
1096
|
+
<span class='id identifier rubyid_doi_pattern'>doi_pattern</span> <span class='op'>=</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_content'>(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![%"#? ])\\S)+)</span><span class='tstring_end'>'</span></span>
|
1097
|
+
<span class='id identifier rubyid_z'>z</span> <span class='op'>=</span> <span class='id identifier rubyid_z'>z</span><span class='period'>.</span><span class='id identifier rubyid_match'>match</span><span class='lparen'>(</span><span class='id identifier rubyid_doi_pattern'>doi_pattern</span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_to_s'>to_s</span><span class='period'>.</span><span class='id identifier rubyid_gsub'>gsub</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>\s.+</span><span class='regexp_end'>/</span></span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span> <span class='comment'># z = z.match("10\\.[0-9]+/.+").to_s.gsub(/\s.+/, '')
|
1098
|
+
</span>
|
1099
|
+
<span class='kw'>end</span> <span class='comment'># clean up doi
|
1100
|
+
</span>
|
1101
|
+
<span class='id identifier rubyid_z'>z</span> <span class='op'>=</span> <span class='id identifier rubyid_z'>z</span><span class='period'>.</span><span class='id identifier rubyid_gsub'>gsub</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>\.$|\.;$|\.\]$|\.\}$|\.\)$|,$</span><span class='regexp_end'>/</span></span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span>
|
1102
|
+
<span class='kw'>return</span> <span class='id identifier rubyid_z'>z</span><span class='period'>.</span><span class='id identifier rubyid_gsub'>gsub</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>;$|\]$|\}$|\)$</span><span class='regexp_end'>/</span></span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>'</span><span class='tstring_end'>'</span></span><span class='rparen'>)</span>
|
1103
|
+
<span class='rbrace'>}</span><span class='lbracket'>[</span><span class='int'>0</span><span class='rbracket'>]</span>
|
1104
|
+
<span class='kw'>end</span></pre>
|
1105
|
+
</td>
|
1106
|
+
</tr>
|
1107
|
+
</table>
|
1108
|
+
</div>
|
1109
|
+
|
1110
|
+
</div>
|
1111
|
+
|
1112
|
+
</div>
|
1113
|
+
|
1114
|
+
<div id="footer">
|
1115
|
+
Generated on Wed Apr 15 16:00:31 2020 by
|
1116
|
+
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
1117
|
+
0.9.24 (ruby-2.7.1).
|
1118
|
+
</div>
|
1119
|
+
|
1120
|
+
</div>
|
1121
|
+
</body>
|
1122
|
+
</html>
|