extcite 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1122 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset="utf-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>
7
+ Module: Extcite
8
+
9
+ &mdash; Documentation by YARD 0.9.24
10
+
11
+ </title>
12
+
13
+ <link rel="stylesheet" href="css/style.css" type="text/css" />
14
+
15
+ <link rel="stylesheet" href="css/common.css" type="text/css" />
16
+
17
+ <script type="text/javascript">
18
+ pathId = "Extcite";
19
+ relpath = '';
20
+ </script>
21
+
22
+
23
+ <script type="text/javascript" charset="utf-8" src="js/jquery.js"></script>
24
+
25
+ <script type="text/javascript" charset="utf-8" src="js/app.js"></script>
26
+
27
+
28
+ </head>
29
+ <body>
30
+ <div class="nav_wrap">
31
+ <iframe id="nav" src="class_list.html?1"></iframe>
32
+ <div id="resizer"></div>
33
+ </div>
34
+
35
+ <div id="main" tabindex="-1">
36
+ <div id="header">
37
+ <div id="menu">
38
+
39
+ <a href="_index.html">Index (E)</a> &raquo;
40
+
41
+
42
+ <span class="title">Extcite</span>
43
+
44
+ </div>
45
+
46
+ <div id="search">
47
+
48
+ <a class="full_list_link" id="class_list_link"
49
+ href="class_list.html">
50
+
51
+ <svg width="24" height="24">
52
+ <rect x="0" y="4" width="24" height="4" rx="1" ry="1"></rect>
53
+ <rect x="0" y="12" width="24" height="4" rx="1" ry="1"></rect>
54
+ <rect x="0" y="20" width="24" height="4" rx="1" ry="1"></rect>
55
+ </svg>
56
+ </a>
57
+
58
+ </div>
59
+ <div class="clear"></div>
60
+ </div>
61
+
62
+ <div id="content"><h1>Module: Extcite
63
+
64
+
65
+
66
+ </h1>
67
+ <div class="box_info">
68
+
69
+
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+ <dl>
80
+ <dt>Defined in:</dt>
81
+ <dd>lib/extcite.rb<span class="defines">,<br />
82
+ lib/extcite/version.rb</span>
83
+ </dd>
84
+ </dl>
85
+
86
+ </div>
87
+
88
+
89
+
90
+ <h2>
91
+ Constant Summary
92
+ <small><a href="#" class="constants_summary_toggle">collapse</a></small>
93
+ </h2>
94
+
95
+ <dl class="constants">
96
+
97
+ <dt id="VERSION-constant" class="">VERSION =
98
+
99
+ </dt>
100
+ <dd><pre class="code"><span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>0.4.0</span><span class='tstring_end'>&quot;</span></span></pre></dd>
101
+
102
+ </dl>
103
+
104
+
105
+
106
+
107
+
108
+
109
+
110
+
111
+
112
+ <h2>
113
+ Class Method Summary
114
+ <small><a href="#" class="summary_toggle">collapse</a></small>
115
+ </h2>
116
+
117
+ <ul class="summary">
118
+
119
+ <li class="public ">
120
+ <span class="summary_signature">
121
+
122
+ <a href="#cont_neg-class_method" title="cont_neg (class method)">.<strong>cont_neg</strong>(ids:) &#x21d2; Object </a>
123
+
124
+
125
+
126
+ </span>
127
+
128
+
129
+
130
+
131
+
132
+
133
+
134
+
135
+
136
+ <span class="summary_desc"><div class='inline'>
137
+ <p>Get citation(s) using Crossref content negotiation.</p>
138
+ </div></span>
139
+
140
+ </li>
141
+
142
+
143
+ <li class="public ">
144
+ <span class="summary_signature">
145
+
146
+ <a href="#extract-class_method" title="extract (class method)">.<strong>extract</strong>(path:, file: &quot;out.bib&quot;, output: &quot;bib&quot;) &#x21d2; Object </a>
147
+
148
+
149
+
150
+ </span>
151
+
152
+
153
+
154
+
155
+
156
+
157
+
158
+
159
+
160
+ <span class="summary_desc"><div class='inline'>
161
+ <p>Extract DOIs from one or more PDFs.</p>
162
+ </div></span>
163
+
164
+ </li>
165
+
166
+
167
+ <li class="public ">
168
+ <span class="summary_signature">
169
+
170
+ <a href="#extract_dois-class_method" title="extract_dois (class method)">.<strong>extract_dois</strong>(path:) &#x21d2; Object </a>
171
+
172
+
173
+
174
+ </span>
175
+
176
+
177
+
178
+
179
+
180
+
181
+
182
+
183
+
184
+ <span class="summary_desc"><div class='inline'>
185
+ <p>Extract DOIs from one or more PDFs after extracting text.</p>
186
+ </div></span>
187
+
188
+ </li>
189
+
190
+
191
+ <li class="public ">
192
+ <span class="summary_signature">
193
+
194
+ <a href="#extract_from_metadata-class_method" title="extract_from_metadata (class method)">.<strong>extract_from_metadata</strong>(path:) &#x21d2; Object </a>
195
+
196
+
197
+
198
+ </span>
199
+
200
+
201
+
202
+
203
+
204
+
205
+
206
+
207
+
208
+ <span class="summary_desc"><div class='inline'>
209
+ <p>Try to extract DOIs from one or more PDF metadata sections.</p>
210
+ </div></span>
211
+
212
+ </li>
213
+
214
+
215
+ <li class="public ">
216
+ <span class="summary_signature">
217
+
218
+ <a href="#extract_text-class_method" title="extract_text (class method)">.<strong>extract_text</strong>(path:) &#x21d2; Object </a>
219
+
220
+
221
+
222
+ </span>
223
+
224
+
225
+
226
+
227
+
228
+
229
+
230
+
231
+
232
+ <span class="summary_desc"><div class='inline'>
233
+ <p>Extract text from a pdf, or many pdfs.</p>
234
+ </div></span>
235
+
236
+ </li>
237
+
238
+
239
+ <li class="public ">
240
+ <span class="summary_signature">
241
+
242
+ <a href="#get_ids-class_method" title="get_ids (class method)">.<strong>get_ids</strong>(txt:) &#x21d2; Object </a>
243
+
244
+
245
+
246
+ </span>
247
+
248
+
249
+
250
+
251
+
252
+
253
+
254
+
255
+
256
+ <span class="summary_desc"><div class='inline'>
257
+ <p>Get DOIs from a String or Array of Strings.</p>
258
+ </div></span>
259
+
260
+ </li>
261
+
262
+
263
+ </ul>
264
+
265
+
266
+
267
+
268
+ <div id="class_method_details" class="method_details_list">
269
+ <h2>Class Method Details</h2>
270
+
271
+
272
+ <div class="method_details first">
273
+ <h3 class="signature first" id="cont_neg-class_method">
274
+
275
+ .<strong>cont_neg</strong>(ids:) &#x21d2; <tt>Object</tt>
276
+
277
+
278
+
279
+
280
+
281
+ </h3><div class="docstring">
282
+ <div class="discussion">
283
+
284
+ <p>Get citation(s) using Crossref content negotiation</p>
285
+
286
+ <p>Return: a string of bib data</p>
287
+
288
+
289
+ </div>
290
+ </div>
291
+ <div class="tags">
292
+
293
+ <div class="examples">
294
+ <p class="tag_title">Examples:</p>
295
+
296
+
297
+ <pre class="example code"><code><span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>extcite</span><span class='tstring_end'>&#39;</span></span>
298
+ <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_cont_neg'>cont_neg</span><span class='lparen'>(</span><span class='label'>ids:</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>10.1016/j.dendro.2014.01.004</span><span class='tstring_end'>&quot;</span></span><span class='rparen'>)</span></code></pre>
299
+
300
+ </div>
301
+ <p class="tag_title">Parameters:</p>
302
+ <ul class="param">
303
+
304
+ <li>
305
+
306
+ <span class='name'>ids</span>
307
+
308
+
309
+ <span class='type'>(<tt><span class='object_link'><a href="Array.html" title="Array (class)">Array</a></span>[<span class='object_link'><a href="String.html" title="String (class)">String</a></span>]</tt>)</span>
310
+
311
+
312
+
313
+ &mdash;
314
+ <div class='inline'>
315
+ <p>One or more DOIs in an array</p>
316
+ </div>
317
+
318
+ </li>
319
+
320
+ </ul>
321
+
322
+
323
+ </div><table class="source_code">
324
+ <tr>
325
+ <td>
326
+ <pre class="lines">
327
+
328
+
329
+ 296
330
+ 297
331
+ 298
332
+ 299</pre>
333
+ </td>
334
+ <td>
335
+ <pre class="code"><span class="info file"># File 'lib/extcite.rb', line 296</span>
336
+
337
+ <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_cont_neg'>cont_neg</span><span class='lparen'>(</span><span class='label'>ids:</span><span class='rparen'>)</span>
338
+ <span class='id identifier rubyid_out'>out</span> <span class='op'>=</span> <span class='const'>Serrano</span><span class='period'>.</span><span class='id identifier rubyid_content_negotiation'>content_negotiation</span><span class='lparen'>(</span><span class='label'>ids:</span> <span class='id identifier rubyid_ids'>ids</span><span class='rparen'>)</span>
339
+ <span class='kw'>return</span> <span class='id identifier rubyid_out'>out</span>
340
+ <span class='kw'>end</span></pre>
341
+ </td>
342
+ </tr>
343
+ </table>
344
+ </div>
345
+
346
+ <div class="method_details ">
347
+ <h3 class="signature " id="extract-class_method">
348
+
349
+ .<strong>extract</strong>(path:, file: &quot;out.bib&quot;, output: &quot;bib&quot;) &#x21d2; <tt>Object</tt>
350
+
351
+
352
+
353
+
354
+
355
+ </h3><div class="docstring">
356
+ <div class="discussion">
357
+
358
+ <p>Extract DOIs from one or more PDFs</p>
359
+
360
+ <p>Return: writes bib files to a .bib file or an array if file is nil</p>
361
+
362
+ <p>When writing to a file, <code>extract</code> by default appends to the end
363
+ of the file so you can build up your bibtex file with your
364
+ citations
365
+ </p>
366
+
367
+
368
+ </div>
369
+ </div>
370
+ <div class="tags">
371
+
372
+ <div class="examples">
373
+ <p class="tag_title">Examples:</p>
374
+
375
+
376
+ <pre class="example code"><code><span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>extcite</span><span class='tstring_end'>&#39;</span></span>
377
+ <span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>faraday</span><span class='tstring_end'>&#39;</span></span><span class='comment'># get a paper in pdf format
378
+ </span>
379
+ <span class='id identifier rubyid_path'>path</span> <span class='op'>=</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>2068.pdf</span><span class='tstring_end'>&#39;</span></span>
380
+ <span class='id identifier rubyid_res'>res</span> <span class='op'>=</span> <span class='const'>Faraday</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='symbol'>:url</span> <span class='op'>=&gt;</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>https://peerj.com/articles/2068.pdf</span><span class='tstring_end'>&quot;</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_get'>get</span><span class='semicolon'>;</span>
381
+ <span class='id identifier rubyid_f'>f</span> <span class='op'>=</span> <span class='const'>File</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>wb</span><span class='tstring_end'>&quot;</span></span><span class='rparen'>)</span><span class='semicolon'>;</span>
382
+ <span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_write'>write</span><span class='lparen'>(</span><span class='id identifier rubyid_res'>res</span><span class='period'>.</span><span class='id identifier rubyid_body'>body</span><span class='rparen'>)</span><span class='semicolon'>;</span>
383
+ <span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_close'>close</span><span class='lparen'>(</span><span class='rparen'>)</span><span class='comment'># extract doi from the pdf
384
+ </span>
385
+ <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_extract'>extract</span><span class='lparen'>(</span><span class='label'>path:</span> <span class='id identifier rubyid_path'>path</span><span class='rparen'>)</span>
386
+ <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_extract'>extract</span><span class='lparen'>(</span><span class='label'>path:</span> <span class='id identifier rubyid_path'>path</span><span class='comma'>,</span> <span class='label'>file:</span> <span class='kw'>nil</span><span class='rparen'>)</span></code></pre>
387
+
388
+ </div>
389
+ <p class="tag_title">Parameters:</p>
390
+ <ul class="param">
391
+
392
+ <li>
393
+
394
+ <span class='name'>path</span>
395
+
396
+
397
+ <span class='type'>(<tt><span class='object_link'><a href="String.html" title="String (class)">String</a></span></tt>)</span>
398
+
399
+
400
+
401
+ &mdash;
402
+ <div class='inline'>
403
+ <p>Path to a pdf file, or a folder of PDF files</p>
404
+ </div>
405
+
406
+ </li>
407
+
408
+ <li>
409
+
410
+ <span class='name'>file</span>
411
+
412
+
413
+ <span class='type'>(<tt><span class='object_link'><a href="String.html" title="String (class)">String</a></span></tt>)</span>
414
+
415
+
416
+ <em class="default">(defaults to: <tt>&quot;out.bib&quot;</tt>)</em>
417
+
418
+
419
+ &mdash;
420
+ <div class='inline'>
421
+ <p>File name to write data to - or nil to stdout</p>
422
+ </div>
423
+
424
+ </li>
425
+
426
+ <li>
427
+
428
+ <span class='name'>output</span>
429
+
430
+
431
+ <span class='type'>(<tt><span class='object_link'><a href="String.html" title="String (class)">String</a></span></tt>)</span>
432
+
433
+
434
+ <em class="default">(defaults to: <tt>&quot;bib&quot;</tt>)</em>
435
+
436
+
437
+ &mdash;
438
+ <div class='inline'>
439
+ <p>Type of output. Only bibtex for now</p>
440
+ </div>
441
+
442
+ </li>
443
+
444
+ </ul>
445
+
446
+
447
+ </div><table class="source_code">
448
+ <tr>
449
+ <td>
450
+ <pre class="lines">
451
+
452
+
453
+ 38
454
+ 39
455
+ 40
456
+ 41
457
+ 42
458
+ 43
459
+ 44
460
+ 45
461
+ 46
462
+ 47
463
+ 48
464
+ 49
465
+ 50
466
+ 51
467
+ 52
468
+ 53
469
+ 54
470
+ 55
471
+ 56
472
+ 57
473
+ 58
474
+ 59
475
+ 60
476
+ 61
477
+ 62
478
+ 63
479
+ 64
480
+ 65
481
+ 66
482
+ 67
483
+ 68
484
+ 69
485
+ 70
486
+ 71
487
+ 72
488
+ 73
489
+ 74
490
+ 75
491
+ 76
492
+ 77
493
+ 78
494
+ 79
495
+ 80
496
+ 81
497
+ 82
498
+ 83
499
+ 84
500
+ 85
501
+ 86
502
+ 87
503
+ 88
504
+ 89
505
+ 90
506
+ 91
507
+ 92
508
+ 93
509
+ 94
510
+ 95
511
+ 96
512
+ 97
513
+ 98
514
+ 99
515
+ 100
516
+ 101
517
+ 102
518
+ 103
519
+ 104
520
+ 105
521
+ 106
522
+ 107
523
+ 108
524
+ 109
525
+ 110
526
+ 111
527
+ 112
528
+ 113
529
+ 114
530
+ 115
531
+ 116
532
+ 117</pre>
533
+ </td>
534
+ <td>
535
+ <pre class="code"><span class="info file"># File 'lib/extcite.rb', line 38</span>
536
+
537
+ <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_extract'>extract</span><span class='lparen'>(</span><span class='label'>path:</span><span class='comma'>,</span> <span class='label'>file:</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>out.bib</span><span class='tstring_end'>&quot;</span></span><span class='comma'>,</span> <span class='label'>output:</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>bib</span><span class='tstring_end'>&quot;</span></span><span class='rparen'>)</span>
538
+ <span class='id identifier rubyid_path'>path</span> <span class='op'>=</span> <span class='id identifier rubyid_make_paths'><span class='object_link'><a href="top-level-namespace.html#make_paths-instance_method" title="#make_paths (method)">make_paths</a></span></span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='rparen'>)</span>
539
+ <span class='id identifier rubyid_path'>path</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_x'>x</span><span class='op'>|</span>
540
+ <span class='comment'># try PDF metadata first
541
+ </span> <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='kw'>nil</span>
542
+ <span class='id identifier rubyid_rr'>rr</span> <span class='op'>=</span> <span class='const'>PDF</span><span class='op'>::</span><span class='const'>Reader</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='id identifier rubyid_x'>x</span><span class='rparen'>)</span><span class='semicolon'>;</span>
543
+ <span class='id identifier rubyid_pdfmeta'>pdfmeta</span> <span class='op'>=</span> <span class='id identifier rubyid_rr'>rr</span><span class='period'>.</span><span class='id identifier rubyid_metadata'>metadata</span>
544
+ <span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_pdfmeta'>pdfmeta</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
545
+ <span class='id identifier rubyid_xml'>xml</span> <span class='op'>=</span> <span class='const'>Oga</span><span class='period'>.</span><span class='id identifier rubyid_parse_xml'>parse_xml</span><span class='lparen'>(</span><span class='id identifier rubyid_pdfmeta'>pdfmeta</span><span class='rparen'>)</span><span class='semicolon'>;</span>
546
+ <span class='kw'>begin</span>
547
+ <span class='id identifier rubyid_tt'>tt</span> <span class='op'>=</span> <span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_xpath'>xpath</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//rdf:Description</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span> <span class='comment'># try dc:identifier attribute
548
+ </span>
549
+ <span class='id identifier rubyid_ss'>ss</span> <span class='op'>=</span> <span class='id identifier rubyid_tt'>tt</span><span class='period'>.</span><span class='id identifier rubyid_attr'>attr</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>dc:identifier</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span><span class='lbracket'>[</span><span class='int'>0</span><span class='rbracket'>]</span>
550
+ <span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_ss'>ss</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
551
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='id identifier rubyid_ss'>ss</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span><span class='period'>.</span><span class='id identifier rubyid_sub'>sub</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>doi:</span><span class='regexp_end'>/</span></span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span>
552
+ <span class='kw'>else</span>
553
+ <span class='comment'># try prism:doi node
554
+ </span> <span class='id identifier rubyid_pdoi'>pdoi</span> <span class='op'>=</span> <span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_xpath'>xpath</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//rdf:Description//prism:doi</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span>
555
+ <span class='kw'>if</span> <span class='id identifier rubyid_pdoi'>pdoi</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>==</span> <span class='int'>1</span>
556
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='id identifier rubyid_pdoi'>pdoi</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
557
+ <span class='kw'>else</span>
558
+ <span class='comment'># try pdf:WPS-ARTICLEDOI node
559
+ </span> <span class='id identifier rubyid_wpsdoi'>wpsdoi</span> <span class='op'>=</span> <span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_xpath'>xpath</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//rdf:Description//pdf:WPS-ARTICLEDOI</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span>
560
+ <span class='kw'>if</span> <span class='id identifier rubyid_wpsdoi'>wpsdoi</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>==</span> <span class='int'>1</span>
561
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='id identifier rubyid_wpsdoi'>wpsdoi</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
562
+ <span class='kw'>else</span>
563
+ <span class='comment'># try pdfx:WPS-ARTICLEDOI node
564
+ </span> <span class='id identifier rubyid_pdfxwpsdoi'>pdfxwpsdoi</span> <span class='op'>=</span> <span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_xpath'>xpath</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//rdf:Description//pdfx:WPS-ARTICLEDOI</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span>
565
+ <span class='kw'>if</span> <span class='id identifier rubyid_pdfxwpsdoi'>pdfxwpsdoi</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>==</span> <span class='int'>1</span>
566
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='id identifier rubyid_pdfxwpsdoi'>pdfxwpsdoi</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
567
+ <span class='kw'>else</span>
568
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='kw'>nil</span>
569
+ <span class='kw'>end</span>
570
+ <span class='kw'>end</span>
571
+ <span class='kw'>end</span>
572
+ <span class='kw'>end</span>
573
+ <span class='kw'>rescue</span>
574
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='kw'>nil</span>
575
+ <span class='kw'>end</span>
576
+ <span class='kw'>end</span>
577
+
578
+ <span class='comment'># if not found, try regexing for DOI
579
+ </span> <span class='kw'>if</span> <span class='id identifier rubyid_ids'>ids</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
580
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_get_ids'><span class='object_link'><a href="#get_ids-class_method" title="Extcite.get_ids (method)">get_ids</a></span></span><span class='lparen'>(</span><span class='label'>txt:</span> <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_extract_text_one'>extract_text_one</span><span class='lparen'>(</span><span class='id identifier rubyid_x'>x</span><span class='rparen'>)</span><span class='rparen'>)</span>
581
+ <span class='kw'>end</span>
582
+
583
+ <span class='kw'>if</span> <span class='id identifier rubyid_ids'>ids</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>==</span> <span class='int'>0</span>
584
+ <span class='id identifier rubyid_puts'>puts</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>no DOI found in </span><span class='tstring_end'>&quot;</span></span> <span class='op'>+</span> <span class='id identifier rubyid_x'>x</span>
585
+ <span class='kw'>else</span>
586
+ <span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_ids'>ids</span><span class='period'>.</span><span class='id identifier rubyid_match'>match</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>arxiv</span><span class='regexp_end'>/i</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span> <span class='op'>&amp;&amp;</span> <span class='id identifier rubyid_ids'>ids</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>&lt;</span> <span class='int'>200</span>
587
+ <span class='id identifier rubyid_conn'>conn</span> <span class='op'>=</span> <span class='const'>Faraday</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='symbol'>:url</span> <span class='op'>=&gt;</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>http://export.arxiv.org/api/query?id_list=</span><span class='tstring_end'>&#39;</span></span> <span class='op'>+</span> <span class='id identifier rubyid_ids'>ids</span><span class='period'>.</span><span class='id identifier rubyid_gsub'>gsub</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>arxiv:</span><span class='regexp_end'>/i</span></span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_get'>get</span>
588
+ <span class='id identifier rubyid_bibs'>bibs</span> <span class='op'>=</span> <span class='id identifier rubyid_conn'>conn</span><span class='period'>.</span><span class='id identifier rubyid_body'>body</span><span class='period'>.</span><span class='id identifier rubyid_make_bib_arxiv'>make_bib_arxiv</span><span class='lparen'>(</span><span class='id identifier rubyid_ids'>ids</span><span class='period'>.</span><span class='id identifier rubyid_gsub'>gsub</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>arxiv:</span><span class='regexp_end'>/i</span></span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span><span class='rparen'>)</span>
589
+ <span class='kw'>else</span>
590
+ <span class='id identifier rubyid_bibs'>bibs</span> <span class='op'>=</span> <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_cont_neg'><span class='object_link'><a href="#cont_neg-class_method" title="Extcite.cont_neg (method)">cont_neg</a></span></span><span class='lparen'>(</span><span class='label'>ids:</span> <span class='id identifier rubyid_ids'>ids</span><span class='rparen'>)</span>
591
+ <span class='kw'>end</span>
592
+
593
+ <span class='comment'># if an error or not found, skip
594
+ </span> <span class='id identifier rubyid_bibstest'>bibstest</span> <span class='op'>=</span> <span class='kw'>nil</span>
595
+ <span class='kw'>if</span> <span class='id identifier rubyid_bibs'>bibs</span><span class='period'>.</span><span class='id identifier rubyid_class'>class</span> <span class='op'>==</span> <span class='const'><span class='object_link'><a href="Array.html" title="Array (class)">Array</a></span></span>
596
+ <span class='id identifier rubyid_bibstest'>bibstest</span> <span class='op'>=</span> <span class='id identifier rubyid_bibs'>bibs</span><span class='lbracket'>[</span><span class='int'>0</span><span class='rbracket'>]</span>
597
+ <span class='kw'>else</span>
598
+ <span class='id identifier rubyid_bibstest'>bibstest</span> <span class='op'>=</span> <span class='id identifier rubyid_bibs'>bibs</span>
599
+ <span class='kw'>end</span>
600
+
601
+ <span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_bibstest'>bibstest</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
602
+ <span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_bibstest'>bibstest</span><span class='period'>.</span><span class='id identifier rubyid_match'>match</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>error|not found</span><span class='regexp_end'>/i</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span> <span class='op'>||</span> <span class='op'>!</span><span class='id identifier rubyid_bibstest'>bibstest</span><span class='period'>.</span><span class='id identifier rubyid_match'>match</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>&lt;\/html&gt;</span><span class='regexp_end'>/i</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
603
+ <span class='id identifier rubyid_puts'>puts</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>DOI found: </span><span class='tstring_end'>&quot;</span></span> <span class='op'>+</span> <span class='id identifier rubyid_ids'>ids</span> <span class='op'>+</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'> ; but citation not found via content negotation - passing</span><span class='tstring_end'>&quot;</span></span> <span class='comment'># do something else?
604
+ </span>
605
+ <span class='kw'>else</span>
606
+ <span class='kw'>if</span> <span class='id identifier rubyid_file'>file</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
607
+ <span class='kw'>return</span> <span class='id identifier rubyid_bibstest'>bibstest</span>
608
+ <span class='kw'>else</span>
609
+ <span class='id identifier rubyid_puts'>puts</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>writing </span><span class='tstring_end'>&quot;</span></span> <span class='op'>+</span> <span class='id identifier rubyid_ids'>ids</span> <span class='op'>+</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'> to </span><span class='tstring_end'>&quot;</span></span> <span class='op'>+</span> <span class='id identifier rubyid_file'>file</span>
610
+ <span class='id identifier rubyid_bibs'>bibs</span><span class='period'>.</span><span class='id identifier rubyid_write_bib'>write_bib</span><span class='lparen'>(</span><span class='id identifier rubyid_file'>file</span><span class='rparen'>)</span>
611
+ <span class='kw'>end</span>
612
+ <span class='kw'>end</span>
613
+ <span class='kw'>end</span>
614
+ <span class='kw'>end</span>
615
+ <span class='kw'>end</span>
616
+ <span class='kw'>end</span></pre>
617
+ </td>
618
+ </tr>
619
+ </table>
620
+ </div>
621
+
622
+ <div class="method_details ">
623
+ <h3 class="signature " id="extract_dois-class_method">
624
+
625
+ .<strong>extract_dois</strong>(path:) &#x21d2; <tt>Object</tt>
626
+
627
+
628
+
629
+
630
+
631
+ </h3><div class="docstring">
632
+ <div class="discussion">
633
+
634
+ <p>Extract DOIs from one or more PDFs after extracting text</p>
635
+
636
+
637
+ </div>
638
+ </div>
639
+ <div class="tags">
640
+
641
+ <div class="examples">
642
+ <p class="tag_title">Examples:</p>
643
+
644
+
645
+ <pre class="example code"><code><span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>extcite</span><span class='tstring_end'>&#39;</span></span>
646
+ <span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>faraday</span><span class='tstring_end'>&#39;</span></span><span class='comment'># get a paper in pdf format
647
+ </span>
648
+ <span class='id identifier rubyid_path'>path</span> <span class='op'>=</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>2068.pdf</span><span class='tstring_end'>&#39;</span></span>
649
+ <span class='id identifier rubyid_res'>res</span> <span class='op'>=</span> <span class='const'>Faraday</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='symbol'>:url</span> <span class='op'>=&gt;</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>https://peerj.com/articles/2068.pdf</span><span class='tstring_end'>&quot;</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_get'>get</span>
650
+ <span class='id identifier rubyid_f'>f</span> <span class='op'>=</span> <span class='const'>File</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>wb</span><span class='tstring_end'>&quot;</span></span><span class='rparen'>)</span>
651
+ <span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_write'>write</span><span class='lparen'>(</span><span class='id identifier rubyid_res'>res</span><span class='period'>.</span><span class='id identifier rubyid_body'>body</span><span class='rparen'>)</span>
652
+ <span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_close'>close</span><span class='lparen'>(</span><span class='rparen'>)</span><span class='comment'># extract doi from the pdf
653
+ </span>
654
+ <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_extract_dois'>extract_dois</span><span class='lparen'>(</span><span class='label'>path:</span> <span class='id identifier rubyid_path'>path</span><span class='rparen'>)</span></code></pre>
655
+
656
+ </div>
657
+ <p class="tag_title">Parameters:</p>
658
+ <ul class="param">
659
+
660
+ <li>
661
+
662
+ <span class='name'>path</span>
663
+
664
+
665
+ <span class='type'>(<tt><span class='object_link'><a href="String.html" title="String (class)">String</a></span></tt>)</span>
666
+
667
+
668
+
669
+ &mdash;
670
+ <div class='inline'>
671
+ <p>Path to a PDF file, or a folder of PDF files</p>
672
+ </div>
673
+
674
+ </li>
675
+
676
+ </ul>
677
+
678
+
679
+ </div><table class="source_code">
680
+ <tr>
681
+ <td>
682
+ <pre class="lines">
683
+
684
+
685
+ 210
686
+ 211
687
+ 212
688
+ 213
689
+ 214</pre>
690
+ </td>
691
+ <td>
692
+ <pre class="code"><span class="info file"># File 'lib/extcite.rb', line 210</span>
693
+
694
+ <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_extract_dois'>extract_dois</span><span class='lparen'>(</span><span class='label'>path:</span><span class='rparen'>)</span>
695
+ <span class='id identifier rubyid_txt'>txt</span> <span class='op'>=</span> <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_extract_text'><span class='object_link'><a href="#extract_text-class_method" title="Extcite.extract_text (method)">extract_text</a></span></span><span class='lparen'>(</span><span class='label'>path:</span> <span class='id identifier rubyid_path'>path</span><span class='rparen'>)</span> <span class='comment'># return txt.map { |z| z.match(&quot;[0-9]+\\.[0-9]+/.+&quot;).to_s.gsub(/\s.+/, &#39;&#39;) }
696
+ </span>
697
+ <span class='kw'>return</span> <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_get_ids'><span class='object_link'><a href="#get_ids-class_method" title="Extcite.get_ids (method)">get_ids</a></span></span><span class='lparen'>(</span><span class='label'>txt:</span> <span class='id identifier rubyid_txt'>txt</span><span class='rparen'>)</span>
698
+ <span class='kw'>end</span></pre>
699
+ </td>
700
+ </tr>
701
+ </table>
702
+ </div>
703
+
704
+ <div class="method_details ">
705
+ <h3 class="signature " id="extract_from_metadata-class_method">
706
+
707
+ .<strong>extract_from_metadata</strong>(path:) &#x21d2; <tt>Object</tt>
708
+
709
+
710
+
711
+
712
+
713
+ </h3><div class="docstring">
714
+ <div class="discussion">
715
+
716
+ <p>Try to extract DOIs from one or more PDF metadata sections</p>
717
+
718
+ <p>Return: DOI string</p>
719
+
720
+
721
+ </div>
722
+ </div>
723
+ <div class="tags">
724
+
725
+ <div class="examples">
726
+ <p class="tag_title">Examples:</p>
727
+
728
+
729
+ <pre class="example code"><code><span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>extcite</span><span class='tstring_end'>&#39;</span></span>
730
+ <span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>faraday</span><span class='tstring_end'>&#39;</span></span><span class='comment'># get a paper in pdf format
731
+ </span>
732
+ <span class='id identifier rubyid_path'>path</span> <span class='op'>=</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>2068.pdf</span><span class='tstring_end'>&#39;</span></span>
733
+ <span class='id identifier rubyid_res'>res</span> <span class='op'>=</span> <span class='const'>Faraday</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='symbol'>:url</span> <span class='op'>=&gt;</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>https://peerj.com/articles/2068.pdf</span><span class='tstring_end'>&quot;</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_get'>get</span><span class='semicolon'>;</span>
734
+ <span class='id identifier rubyid_f'>f</span> <span class='op'>=</span> <span class='const'>File</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>wb</span><span class='tstring_end'>&quot;</span></span><span class='rparen'>)</span><span class='semicolon'>;</span>
735
+ <span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_write'>write</span><span class='lparen'>(</span><span class='id identifier rubyid_res'>res</span><span class='period'>.</span><span class='id identifier rubyid_body'>body</span><span class='rparen'>)</span>
736
+ <span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_close'>close</span><span class='lparen'>(</span><span class='rparen'>)</span><span class='comment'># extract doi from the pdf
737
+ </span>
738
+ <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_extract_from_metadata'>extract_from_metadata</span><span class='lparen'>(</span><span class='label'>path:</span> <span class='id identifier rubyid_path'>path</span><span class='rparen'>)</span></code></pre>
739
+
740
+ </div>
741
+ <p class="tag_title">Parameters:</p>
742
+ <ul class="param">
743
+
744
+ <li>
745
+
746
+ <span class='name'>path</span>
747
+
748
+
749
+ <span class='type'>(<tt><span class='object_link'><a href="String.html" title="String (class)">String</a></span></tt>)</span>
750
+
751
+
752
+
753
+ &mdash;
754
+ <div class='inline'>
755
+ <p>Path to a PDF file, or a folder of PDF files</p>
756
+ </div>
757
+
758
+ </li>
759
+
760
+ </ul>
761
+
762
+
763
+ </div><table class="source_code">
764
+ <tr>
765
+ <td>
766
+ <pre class="lines">
767
+
768
+
769
+ 137
770
+ 138
771
+ 139
772
+ 140
773
+ 141
774
+ 142
775
+ 143
776
+ 144
777
+ 145
778
+ 146
779
+ 147
780
+ 148
781
+ 149
782
+ 150
783
+ 151
784
+ 152
785
+ 153
786
+ 154
787
+ 155
788
+ 156
789
+ 157
790
+ 158
791
+ 159
792
+ 160
793
+ 161
794
+ 162
795
+ 163
796
+ 164
797
+ 165
798
+ 166
799
+ 167
800
+ 168
801
+ 169
802
+ 170
803
+ 171
804
+ 172
805
+ 173
806
+ 174
807
+ 175
808
+ 176
809
+ 177
810
+ 178
811
+ 179
812
+ 180
813
+ 181
814
+ 182
815
+ 183
816
+ 184
817
+ 185
818
+ 186
819
+ 187
820
+ 188
821
+ 189
822
+ 190
823
+ 191
824
+ 192</pre>
825
+ </td>
826
+ <td>
827
+ <pre class="code"><span class="info file"># File 'lib/extcite.rb', line 137</span>
828
+
829
+ <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_extract_from_metadata'>extract_from_metadata</span><span class='lparen'>(</span><span class='label'>path:</span><span class='rparen'>)</span>
830
+ <span class='id identifier rubyid_path'>path</span> <span class='op'>=</span> <span class='id identifier rubyid_make_paths'><span class='object_link'><a href="top-level-namespace.html#make_paths-instance_method" title="#make_paths (method)">make_paths</a></span></span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='rparen'>)</span>
831
+ <span class='id identifier rubyid_path'>path</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_x'>x</span><span class='op'>|</span>
832
+ <span class='comment'># try PDF metadata first
833
+ </span> <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='kw'>nil</span>
834
+ <span class='id identifier rubyid_rr'>rr</span> <span class='op'>=</span> <span class='const'>PDF</span><span class='op'>::</span><span class='const'>Reader</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='id identifier rubyid_x'>x</span><span class='rparen'>)</span>
835
+ <span class='id identifier rubyid_pdfmeta'>pdfmeta</span> <span class='op'>=</span> <span class='id identifier rubyid_rr'>rr</span><span class='period'>.</span><span class='id identifier rubyid_metadata'>metadata</span>
836
+ <span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_pdfmeta'>pdfmeta</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
837
+ <span class='kw'>begin</span>
838
+ <span class='id identifier rubyid_xml'>xml</span> <span class='op'>=</span> <span class='const'>Oga</span><span class='period'>.</span><span class='id identifier rubyid_parse_xml'>parse_xml</span><span class='lparen'>(</span><span class='id identifier rubyid_pdfmeta'>pdfmeta</span><span class='rparen'>)</span><span class='semicolon'>;</span>
839
+ <span class='kw'>rescue</span> <span class='const'>Exception</span> <span class='op'>=&gt;</span> <span class='id identifier rubyid_e'>e</span>
840
+ <span class='id identifier rubyid_xml'>xml</span> <span class='op'>=</span> <span class='kw'>nil</span>
841
+ <span class='kw'>end</span>
842
+
843
+ <span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
844
+ <span class='kw'>begin</span>
845
+ <span class='id identifier rubyid_tt'>tt</span> <span class='op'>=</span> <span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_xpath'>xpath</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//rdf:Description</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span> <span class='comment'># try dc:identifier attribute
846
+ </span>
847
+ <span class='id identifier rubyid_ss'>ss</span> <span class='op'>=</span> <span class='id identifier rubyid_tt'>tt</span><span class='period'>.</span><span class='id identifier rubyid_attr'>attr</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>dc:identifier</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span><span class='lbracket'>[</span><span class='int'>0</span><span class='rbracket'>]</span>
848
+ <span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_ss'>ss</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
849
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='id identifier rubyid_ss'>ss</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span><span class='period'>.</span><span class='id identifier rubyid_sub'>sub</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>doi:</span><span class='regexp_end'>/</span></span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span>
850
+ <span class='kw'>else</span>
851
+ <span class='comment'># try prism:doi node
852
+ </span> <span class='id identifier rubyid_pdoi'>pdoi</span> <span class='op'>=</span> <span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_xpath'>xpath</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//rdf:Description//prism:doi</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span>
853
+ <span class='kw'>if</span> <span class='id identifier rubyid_pdoi'>pdoi</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>==</span> <span class='int'>1</span>
854
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='id identifier rubyid_pdoi'>pdoi</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
855
+ <span class='kw'>else</span>
856
+ <span class='comment'># try pdf:WPS-ARTICLEDOI node
857
+ </span> <span class='id identifier rubyid_wpsdoi'>wpsdoi</span> <span class='op'>=</span> <span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_xpath'>xpath</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//rdf:Description//pdf:WPS-ARTICLEDOI</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span>
858
+ <span class='kw'>if</span> <span class='id identifier rubyid_wpsdoi'>wpsdoi</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>==</span> <span class='int'>1</span>
859
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='id identifier rubyid_wpsdoi'>wpsdoi</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
860
+ <span class='kw'>else</span>
861
+ <span class='comment'># try pdfx:WPS-ARTICLEDOI node
862
+ </span> <span class='id identifier rubyid_pdfxwpsdoi'>pdfxwpsdoi</span> <span class='op'>=</span> <span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_xpath'>xpath</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//rdf:Description//pdfx:WPS-ARTICLEDOI</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span>
863
+ <span class='kw'>if</span> <span class='id identifier rubyid_pdfxwpsdoi'>pdfxwpsdoi</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>==</span> <span class='int'>1</span>
864
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='id identifier rubyid_pdfxwpsdoi'>pdfxwpsdoi</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
865
+ <span class='kw'>else</span>
866
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='kw'>nil</span>
867
+ <span class='kw'>end</span>
868
+ <span class='kw'>end</span>
869
+ <span class='kw'>end</span>
870
+ <span class='kw'>end</span>
871
+ <span class='kw'>rescue</span>
872
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='kw'>nil</span>
873
+ <span class='kw'>end</span>
874
+ <span class='kw'>end</span>
875
+ <span class='kw'>end</span>
876
+
877
+ <span class='comment'># if not found, try regexing for DOI
878
+ </span> <span class='kw'>if</span> <span class='id identifier rubyid_ids'>ids</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
879
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_get_ids'><span class='object_link'><a href="#get_ids-class_method" title="Extcite.get_ids (method)">get_ids</a></span></span><span class='lparen'>(</span><span class='label'>txt:</span> <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_extract_text_one'>extract_text_one</span><span class='lparen'>(</span><span class='id identifier rubyid_x'>x</span><span class='rparen'>)</span><span class='rparen'>)</span>
880
+ <span class='kw'>end</span>
881
+
882
+ <span class='kw'>return</span> <span class='id identifier rubyid_ids'>ids</span>
883
+ <span class='kw'>end</span>
884
+ <span class='kw'>end</span></pre>
885
+ </td>
886
+ </tr>
887
+ </table>
888
+ </div>
889
+
890
+ <div class="method_details ">
891
+ <h3 class="signature " id="extract_text-class_method">
892
+
893
+ .<strong>extract_text</strong>(path:) &#x21d2; <tt>Object</tt>
894
+
895
+
896
+
897
+
898
+
899
+ </h3><div class="docstring">
900
+ <div class="discussion">
901
+
902
+ <p>Extract text from a PDF, or many PDFs</p>
903
+
904
+ <p>This method is used internally within fetch to parse PDFs.</p>
905
+
906
+
907
+ </div>
908
+ </div>
909
+ <div class="tags">
910
+
911
+ <div class="examples">
912
+ <p class="tag_title">Examples:</p>
913
+
914
+
915
+ <pre class="example code"><code><span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>extcite</span><span class='tstring_end'>&#39;</span></span>
916
+ <span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>faraday</span><span class='tstring_end'>&#39;</span></span><span class='comment'># get a paper in pdf format
917
+ </span>
918
+ <span class='id identifier rubyid_path'>path</span> <span class='op'>=</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>2068.pdf</span><span class='tstring_end'>&#39;</span></span>
919
+ <span class='id identifier rubyid_res'>res</span> <span class='op'>=</span> <span class='const'>Faraday</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='symbol'>:url</span> <span class='op'>=&gt;</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>https://peerj.com/articles/2068.pdf</span><span class='tstring_end'>&quot;</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_get'>get</span>
920
+ <span class='id identifier rubyid_f'>f</span> <span class='op'>=</span> <span class='const'>File</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>wb</span><span class='tstring_end'>&quot;</span></span><span class='rparen'>)</span>
921
+ <span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_write'>write</span><span class='lparen'>(</span><span class='id identifier rubyid_res'>res</span><span class='period'>.</span><span class='id identifier rubyid_body'>body</span><span class='rparen'>)</span>
922
+ <span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_close'>close</span><span class='lparen'>(</span><span class='rparen'>)</span><span class='comment'># extract doi from the pdf
923
+ </span>
924
+ <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_extract_text'>extract_text</span><span class='lparen'>(</span><span class='label'>path:</span> <span class='id identifier rubyid_path'>path</span><span class='rparen'>)</span></code></pre>
925
+
926
+ </div>
927
+ <p class="tag_title">Parameters:</p>
928
+ <ul class="param">
929
+
930
+ <li>
931
+
932
+ <span class='name'>path</span>
933
+
934
+
935
+ <span class='type'>(<tt><span class='object_link'><a href="String.html" title="String (class)">String</a></span></tt>)</span>
936
+
937
+
938
+
939
+ &mdash;
940
+ <div class='inline'>
941
+ <p>Path to a PDF file, or a folder of PDF files</p>
942
+ </div>
943
+
944
+ </li>
945
+
946
+ </ul>
947
+
948
+
949
+ </div><table class="source_code">
950
+ <tr>
951
+ <td>
952
+ <pre class="lines">
953
+
954
+
955
+ 263
956
+ 264
957
+ 265
958
+ 266
959
+ 267
960
+ 268
961
+ 269
962
+ 270
963
+ 271
964
+ 272
965
+ 273
966
+ 274
967
+ 275
968
+ 276
969
+ 277
970
+ 278
971
+ 279
972
+ 280
973
+ 281
974
+ 282
975
+ 283
976
+ 284</pre>
977
+ </td>
978
+ <td>
979
+ <pre class="code"><span class="info file"># File 'lib/extcite.rb', line 263</span>
980
+
981
+ <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_extract_text'>extract_text</span><span class='lparen'>(</span><span class='label'>path:</span><span class='rparen'>)</span>
982
+ <span class='id identifier rubyid_path'>path</span> <span class='op'>=</span> <span class='const'><span class='object_link'><a href="Array.html" title="Array (class)">Array</a></span></span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='rparen'>)</span>
983
+ <span class='kw'>if</span> <span class='id identifier rubyid_path'>path</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>==</span> <span class='int'>1</span>
984
+ <span class='kw'>if</span> <span class='const'>File</span><span class='period'>.</span><span class='id identifier rubyid_directory?'>directory?</span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='lbracket'>[</span><span class='int'>0</span><span class='rbracket'>]</span><span class='rparen'>)</span> <span class='comment'># keep only files with .pdf extension
985
+ </span>
986
+ <span class='id identifier rubyid_path'>path</span> <span class='op'>=</span> <span class='id identifier rubyid_dir_files'><span class='object_link'><a href="top-level-namespace.html#dir_files-instance_method" title="#dir_files (method)">dir_files</a></span></span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='lbracket'>[</span><span class='int'>0</span><span class='rbracket'>]</span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_keep_if'>keep_if</span> <span class='lbrace'>{</span> <span class='op'>|</span><span class='id identifier rubyid_z'>z</span><span class='op'>|</span> <span class='op'>!</span><span class='op'>!</span><span class='id identifier rubyid_z'>z</span><span class='period'>.</span><span class='id identifier rubyid_match'>match</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>.pdf</span><span class='regexp_end'>/</span></span><span class='rparen'>)</span> <span class='rbrace'>}</span>
987
+ <span class='kw'>end</span>
988
+ <span class='kw'>end</span>
989
+
990
+ <span class='id identifier rubyid_out'>out</span> <span class='op'>=</span> <span class='lbracket'>[</span><span class='rbracket'>]</span>
991
+ <span class='id identifier rubyid_path'>path</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_x'>x</span><span class='op'>|</span>
992
+ <span class='kw'>begin</span>
993
+ <span class='id identifier rubyid_rr'>rr</span> <span class='op'>=</span> <span class='const'>PDF</span><span class='op'>::</span><span class='const'>Reader</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='id identifier rubyid_x'>x</span><span class='rparen'>)</span>
994
+ <span class='id identifier rubyid_txt'>txt</span> <span class='op'>=</span> <span class='id identifier rubyid_rr'>rr</span><span class='period'>.</span><span class='id identifier rubyid_pages'>pages</span><span class='period'>.</span><span class='id identifier rubyid_map'>map</span> <span class='lbrace'>{</span> <span class='op'>|</span><span class='id identifier rubyid_page'>page</span><span class='op'>|</span> <span class='id identifier rubyid_page'>page</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span> <span class='rbrace'>}</span><span class='period'>.</span><span class='id identifier rubyid_join'>join</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>\n</span><span class='tstring_end'>&quot;</span></span><span class='rparen'>)</span>
995
+ <span class='kw'>rescue</span> <span class='const'>Exception</span> <span class='op'>=&gt;</span> <span class='id identifier rubyid_e'>e</span>
996
+ <span class='id identifier rubyid_warn'>warn</span> <span class='id identifier rubyid_e'>e</span>
997
+ <span class='id identifier rubyid_txt'>txt</span> <span class='op'>=</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_end'>&quot;</span></span>
998
+ <span class='kw'>end</span>
999
+ <span class='id identifier rubyid_out'>out</span> <span class='op'>&lt;&lt;</span> <span class='id identifier rubyid_txt'>txt</span>
1000
+ <span class='kw'>end</span>
1001
+ <span class='kw'>return</span> <span class='id identifier rubyid_out'>out</span>
1002
+ <span class='kw'>end</span></pre>
1003
+ </td>
1004
+ </tr>
1005
+ </table>
1006
+ </div>
1007
+
1008
+ <div class="method_details ">
1009
+ <h3 class="signature " id="get_ids-class_method">
1010
+
1011
+ .<strong>get_ids</strong>(txt:) &#x21d2; <tt>Object</tt>
1012
+
1013
+
1014
+
1015
+
1016
+
1017
+ </h3><div class="docstring">
1018
+ <div class="discussion">
1019
+
1020
+ <p>Get DOIs from a String or an Array of Strings</p>
1021
+
1022
+ <p>Return: Array of DOIs</p>
1023
+
1024
+
1025
+ </div>
1026
+ </div>
1027
+ <div class="tags">
1028
+
1029
+ <div class="examples">
1030
+ <p class="tag_title">Examples:</p>
1031
+
1032
+
1033
+ <pre class="example code"><code><span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>extcite</span><span class='tstring_end'>&#39;</span></span>
1034
+ <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_get_ids'>get_ids</span><span class='lparen'>(</span><span class='label'>txt:</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>10.1016/j.dendro.2014.01.004 adfasdf asd fas df asdfsd</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span></code></pre>
1035
+
1036
+ </div>
1037
+ <p class="tag_title">Parameters:</p>
1038
+ <ul class="param">
1039
+
1040
+ <li>
1041
+
1042
+ <span class='name'>txt</span>
1043
+
1044
+
1045
+ <span class='type'>(<tt><span class='object_link'><a href="String.html" title="String (class)">String</a></span></tt>)</span>
1046
+
1047
+
1048
+
1049
+ &mdash;
1050
+ <div class='inline'>
1051
+ <p>String or Array of Strings</p>
1052
+ </div>
1053
+
1054
+ </li>
1055
+
1056
+ </ul>
1057
+
1058
+
1059
+ </div><table class="source_code">
1060
+ <tr>
1061
+ <td>
1062
+ <pre class="lines">
1063
+
1064
+
1065
+ 226
1066
+ 227
1067
+ 228
1068
+ 229
1069
+ 230
1070
+ 231
1071
+ 232
1072
+ 233
1073
+ 234
1074
+ 235
1075
+ 236
1076
+ 237
1077
+ 238
1078
+ 239
1079
+ 240
1080
+ 241
1081
+ 242
1082
+ 243</pre>
1083
+ </td>
1084
+ <td>
1085
+ <pre class="code"><span class="info file"># File 'lib/extcite.rb', line 226</span>
1086
+
1087
+ <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_get_ids'>get_ids</span><span class='lparen'>(</span><span class='label'>txt:</span><span class='rparen'>)</span>
1088
+ <span class='comment'># see if there&#39;s
1089
+ </span>
1090
+ <span class='kw'>return</span> <span class='const'><span class='object_link'><a href="Array.html" title="Array (class)">Array</a></span></span><span class='lparen'>(</span><span class='id identifier rubyid_txt'>txt</span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_map'>map</span> <span class='lbrace'>{</span> <span class='op'>|</span><span class='id identifier rubyid_z'>z</span><span class='op'>|</span>
1091
+ <span class='comment'># detect if is an arxiv paper
1092
+ </span> <span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_z'>z</span><span class='period'>.</span><span class='id identifier rubyid_match'>match</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>arxiv:[0-9]+\.[0-9A-Za-z]+</span><span class='regexp_end'>/i</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span> <span class='comment'># if so, return arxiv id for later extraction of arxiv citation via their API
1093
+ </span>
1094
+ <span class='id identifier rubyid_z'>z</span> <span class='op'>=</span> <span class='id identifier rubyid_z'>z</span><span class='period'>.</span><span class='id identifier rubyid_match'>match</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>arxiv:[0-9]+\.[0-9A-Za-z]+</span><span class='regexp_end'>/i</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_to_s'>to_s</span>
1095
+ <span class='kw'>else</span>
1096
+ <span class='id identifier rubyid_doi_pattern'>doi_pattern</span> <span class='op'>=</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![%&quot;#? ])\\S)+)</span><span class='tstring_end'>&#39;</span></span>
1097
+ <span class='id identifier rubyid_z'>z</span> <span class='op'>=</span> <span class='id identifier rubyid_z'>z</span><span class='period'>.</span><span class='id identifier rubyid_match'>match</span><span class='lparen'>(</span><span class='id identifier rubyid_doi_pattern'>doi_pattern</span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_to_s'>to_s</span><span class='period'>.</span><span class='id identifier rubyid_gsub'>gsub</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>\s.+</span><span class='regexp_end'>/</span></span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span> <span class='comment'># z = z.match(&quot;10\\.[0-9]+/.+&quot;).to_s.gsub(/\s.+/, &#39;&#39;)
1098
+ </span>
1099
+ <span class='kw'>end</span> <span class='comment'># clean up doi
1100
+ </span>
1101
+ <span class='id identifier rubyid_z'>z</span> <span class='op'>=</span> <span class='id identifier rubyid_z'>z</span><span class='period'>.</span><span class='id identifier rubyid_gsub'>gsub</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>\.$|\.;$|\.\]$|\.\}$|\.\)$|,$</span><span class='regexp_end'>/</span></span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span>
1102
+ <span class='kw'>return</span> <span class='id identifier rubyid_z'>z</span><span class='period'>.</span><span class='id identifier rubyid_gsub'>gsub</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>;$|\]$|\}$|\)$</span><span class='regexp_end'>/</span></span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span>
1103
+ <span class='rbrace'>}</span><span class='lbracket'>[</span><span class='int'>0</span><span class='rbracket'>]</span>
1104
+ <span class='kw'>end</span></pre>
1105
+ </td>
1106
+ </tr>
1107
+ </table>
1108
+ </div>
1109
+
1110
+ </div>
1111
+
1112
+ </div>
1113
+
1114
+ <div id="footer">
1115
+ Generated on Wed Apr 15 16:00:31 2020 by
1116
+ <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
1117
+ 0.9.24 (ruby-2.7.1).
1118
+ </div>
1119
+
1120
+ </div>
1121
+ </body>
1122
+ </html>