extcite 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,1122 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset="utf-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>
7
+ Module: Extcite
8
+
9
+ &mdash; Documentation by YARD 0.9.24
10
+
11
+ </title>
12
+
13
+ <link rel="stylesheet" href="css/style.css" type="text/css" />
14
+
15
+ <link rel="stylesheet" href="css/common.css" type="text/css" />
16
+
17
+ <script type="text/javascript">
18
+ pathId = "Extcite";
19
+ relpath = '';
20
+ </script>
21
+
22
+
23
+ <script type="text/javascript" charset="utf-8" src="js/jquery.js"></script>
24
+
25
+ <script type="text/javascript" charset="utf-8" src="js/app.js"></script>
26
+
27
+
28
+ </head>
29
+ <body>
30
+ <div class="nav_wrap">
31
+ <iframe id="nav" src="class_list.html?1"></iframe>
32
+ <div id="resizer"></div>
33
+ </div>
34
+
35
+ <div id="main" tabindex="-1">
36
+ <div id="header">
37
+ <div id="menu">
38
+
39
+ <a href="_index.html">Index (E)</a> &raquo;
40
+
41
+
42
+ <span class="title">Extcite</span>
43
+
44
+ </div>
45
+
46
+ <div id="search">
47
+
48
+ <a class="full_list_link" id="class_list_link"
49
+ href="class_list.html">
50
+
51
+ <svg width="24" height="24">
52
+ <rect x="0" y="4" width="24" height="4" rx="1" ry="1"></rect>
53
+ <rect x="0" y="12" width="24" height="4" rx="1" ry="1"></rect>
54
+ <rect x="0" y="20" width="24" height="4" rx="1" ry="1"></rect>
55
+ </svg>
56
+ </a>
57
+
58
+ </div>
59
+ <div class="clear"></div>
60
+ </div>
61
+
62
+ <div id="content"><h1>Module: Extcite
63
+
64
+
65
+
66
+ </h1>
67
+ <div class="box_info">
68
+
69
+
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+ <dl>
80
+ <dt>Defined in:</dt>
81
+ <dd>lib/extcite.rb<span class="defines">,<br />
82
+ lib/extcite/version.rb</span>
83
+ </dd>
84
+ </dl>
85
+
86
+ </div>
87
+
88
+
89
+
90
+ <h2>
91
+ Constant Summary
92
+ <small><a href="#" class="constants_summary_toggle">collapse</a></small>
93
+ </h2>
94
+
95
+ <dl class="constants">
96
+
97
+ <dt id="VERSION-constant" class="">VERSION =
98
+
99
+ </dt>
100
+ <dd><pre class="code"><span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>0.4.0</span><span class='tstring_end'>&quot;</span></span></pre></dd>
101
+
102
+ </dl>
103
+
104
+
105
+
106
+
107
+
108
+
109
+
110
+
111
+
112
+ <h2>
113
+ Class Method Summary
114
+ <small><a href="#" class="summary_toggle">collapse</a></small>
115
+ </h2>
116
+
117
+ <ul class="summary">
118
+
119
+ <li class="public ">
120
+ <span class="summary_signature">
121
+
122
+ <a href="#cont_neg-class_method" title="cont_neg (class method)">.<strong>cont_neg</strong>(ids:) &#x21d2; Object </a>
123
+
124
+
125
+
126
+ </span>
127
+
128
+
129
+
130
+
131
+
132
+
133
+
134
+
135
+
136
+ <span class="summary_desc"><div class='inline'>
137
+ <p>Get citation(s) using Crossref content negotiation.</p>
138
+ </div></span>
139
+
140
+ </li>
141
+
142
+
143
+ <li class="public ">
144
+ <span class="summary_signature">
145
+
146
+ <a href="#extract-class_method" title="extract (class method)">.<strong>extract</strong>(path:, file: &quot;out.bib&quot;, output: &quot;bib&quot;) &#x21d2; Object </a>
147
+
148
+
149
+
150
+ </span>
151
+
152
+
153
+
154
+
155
+
156
+
157
+
158
+
159
+
160
+ <span class="summary_desc"><div class='inline'>
161
+ <p>Extract DOIs from one or more PDFs.</p>
162
+ </div></span>
163
+
164
+ </li>
165
+
166
+
167
+ <li class="public ">
168
+ <span class="summary_signature">
169
+
170
+ <a href="#extract_dois-class_method" title="extract_dois (class method)">.<strong>extract_dois</strong>(path:) &#x21d2; Object </a>
171
+
172
+
173
+
174
+ </span>
175
+
176
+
177
+
178
+
179
+
180
+
181
+
182
+
183
+
184
+ <span class="summary_desc"><div class='inline'>
185
+ <p>Extract DOIs from one or more PDFs after extracting text.</p>
186
+ </div></span>
187
+
188
+ </li>
189
+
190
+
191
+ <li class="public ">
192
+ <span class="summary_signature">
193
+
194
+ <a href="#extract_from_metadata-class_method" title="extract_from_metadata (class method)">.<strong>extract_from_metadata</strong>(path:) &#x21d2; Object </a>
195
+
196
+
197
+
198
+ </span>
199
+
200
+
201
+
202
+
203
+
204
+
205
+
206
+
207
+
208
+ <span class="summary_desc"><div class='inline'>
209
+ <p>Try to extract DOIs from one or more PDF metadata sections.</p>
210
+ </div></span>
211
+
212
+ </li>
213
+
214
+
215
+ <li class="public ">
216
+ <span class="summary_signature">
217
+
218
+ <a href="#extract_text-class_method" title="extract_text (class method)">.<strong>extract_text</strong>(path:) &#x21d2; Object </a>
219
+
220
+
221
+
222
+ </span>
223
+
224
+
225
+
226
+
227
+
228
+
229
+
230
+
231
+
232
+ <span class="summary_desc"><div class='inline'>
233
+ <p>Extract text from a pdf, or many pdfs.</p>
234
+ </div></span>
235
+
236
+ </li>
237
+
238
+
239
+ <li class="public ">
240
+ <span class="summary_signature">
241
+
242
+ <a href="#get_ids-class_method" title="get_ids (class method)">.<strong>get_ids</strong>(txt:) &#x21d2; Object </a>
243
+
244
+
245
+
246
+ </span>
247
+
248
+
249
+
250
+
251
+
252
+
253
+
254
+
255
+
256
+ <span class="summary_desc"><div class='inline'>
257
+ <p>Get DOIs from a String or Array of Strings.</p>
258
+ </div></span>
259
+
260
+ </li>
261
+
262
+
263
+ </ul>
264
+
265
+
266
+
267
+
268
+ <div id="class_method_details" class="method_details_list">
269
+ <h2>Class Method Details</h2>
270
+
271
+
272
+ <div class="method_details first">
273
+ <h3 class="signature first" id="cont_neg-class_method">
274
+
275
+ .<strong>cont_neg</strong>(ids:) &#x21d2; <tt>Object</tt>
276
+
277
+
278
+
279
+
280
+
281
+ </h3><div class="docstring">
282
+ <div class="discussion">
283
+
284
+ <p>Get citation(s) using Crossref content negotiation</p>
285
+
286
+ <p>Return: a string of bib data</p>
287
+
288
+
289
+ </div>
290
+ </div>
291
+ <div class="tags">
292
+
293
+ <div class="examples">
294
+ <p class="tag_title">Examples:</p>
295
+
296
+
297
+ <pre class="example code"><code><span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>extcite</span><span class='tstring_end'>&#39;</span></span>
298
+ <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_cont_neg'>cont_neg</span><span class='lparen'>(</span><span class='label'>ids:</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>10.1016/j.dendro.2014.01.004</span><span class='tstring_end'>&quot;</span></span><span class='rparen'>)</span></code></pre>
299
+
300
+ </div>
301
+ <p class="tag_title">Parameters:</p>
302
+ <ul class="param">
303
+
304
+ <li>
305
+
306
+ <span class='name'>ids</span>
307
+
308
+
309
+ <span class='type'>(<tt><span class='object_link'><a href="Array.html" title="Array (class)">Array</a></span>[<span class='object_link'><a href="String.html" title="String (class)">String</a></span>]</tt>)</span>
310
+
311
+
312
+
313
+ &mdash;
314
+ <div class='inline'>
315
+ <p>One or more DOIs in an array</p>
316
+ </div>
317
+
318
+ </li>
319
+
320
+ </ul>
321
+
322
+
323
+ </div><table class="source_code">
324
+ <tr>
325
+ <td>
326
+ <pre class="lines">
327
+
328
+
329
+ 296
330
+ 297
331
+ 298
332
+ 299</pre>
333
+ </td>
334
+ <td>
335
+ <pre class="code"><span class="info file"># File 'lib/extcite.rb', line 296</span>
336
+
337
+ <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_cont_neg'>cont_neg</span><span class='lparen'>(</span><span class='label'>ids:</span><span class='rparen'>)</span>
338
+ <span class='id identifier rubyid_out'>out</span> <span class='op'>=</span> <span class='const'>Serrano</span><span class='period'>.</span><span class='id identifier rubyid_content_negotiation'>content_negotiation</span><span class='lparen'>(</span><span class='label'>ids:</span> <span class='id identifier rubyid_ids'>ids</span><span class='rparen'>)</span>
339
+ <span class='kw'>return</span> <span class='id identifier rubyid_out'>out</span>
340
+ <span class='kw'>end</span></pre>
341
+ </td>
342
+ </tr>
343
+ </table>
344
+ </div>
345
+
346
+ <div class="method_details ">
347
+ <h3 class="signature " id="extract-class_method">
348
+
349
+ .<strong>extract</strong>(path:, file: &quot;out.bib&quot;, output: &quot;bib&quot;) &#x21d2; <tt>Object</tt>
350
+
351
+
352
+
353
+
354
+
355
+ </h3><div class="docstring">
356
+ <div class="discussion">
357
+
358
+ <p>Extract DOIs from one or more PDFs</p>
359
+
360
+ <p>Return: writes bib files to a .bib file or an array if file is nil</p>
361
+
362
+ <pre class="code ruby"><code class="ruby">When writing to a file, `extract` by default appends to the end
363
+ of the file so you can build up your bibtex file with your
364
+ citations
365
+ </code></pre>
366
+
367
+
368
+ </div>
369
+ </div>
370
+ <div class="tags">
371
+
372
+ <div class="examples">
373
+ <p class="tag_title">Examples:</p>
374
+
375
+
376
+ <pre class="example code"><code><span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>extcite</span><span class='tstring_end'>&#39;</span></span>
377
+ <span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>faraday</span><span class='tstring_end'>&#39;</span></span><span class='comment'># get a paper in pdf format
378
+ </span>
379
+ <span class='id identifier rubyid_path'>path</span> <span class='op'>=</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>2068.pdf</span><span class='tstring_end'>&#39;</span></span>
380
+ <span class='id identifier rubyid_res'>res</span> <span class='op'>=</span> <span class='const'>Faraday</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='symbol'>:url</span> <span class='op'>=&gt;</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>https://peerj.com/articles/2068.pdf</span><span class='tstring_end'>&quot;</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_get'>get</span><span class='semicolon'>;</span>
381
+ <span class='id identifier rubyid_f'>f</span> <span class='op'>=</span> <span class='const'>File</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>wb</span><span class='tstring_end'>&quot;</span></span><span class='rparen'>)</span><span class='semicolon'>;</span>
382
+ <span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_write'>write</span><span class='lparen'>(</span><span class='id identifier rubyid_res'>res</span><span class='period'>.</span><span class='id identifier rubyid_body'>body</span><span class='rparen'>)</span><span class='semicolon'>;</span>
383
+ <span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_close'>close</span><span class='lparen'>(</span><span class='rparen'>)</span><span class='comment'># extract doi from the pdf
384
+ </span>
385
+ <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_extract'>extract</span><span class='lparen'>(</span><span class='label'>path:</span> <span class='id identifier rubyid_path'>path</span><span class='rparen'>)</span>
386
+ <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_extract'>extract</span><span class='lparen'>(</span><span class='label'>path:</span> <span class='id identifier rubyid_path'>path</span><span class='comma'>,</span> <span class='label'>file:</span> <span class='kw'>nil</span><span class='rparen'>)</span></code></pre>
387
+
388
+ </div>
389
+ <p class="tag_title">Parameters:</p>
390
+ <ul class="param">
391
+
392
+ <li>
393
+
394
+ <span class='name'>path</span>
395
+
396
+
397
+ <span class='type'>(<tt><span class='object_link'><a href="String.html" title="String (class)">String</a></span></tt>)</span>
398
+
399
+
400
+
401
+ &mdash;
402
+ <div class='inline'>
403
+ <p>Path to a pdf file, or a folder of PDF files</p>
404
+ </div>
405
+
406
+ </li>
407
+
408
+ <li>
409
+
410
+ <span class='name'>file</span>
411
+
412
+
413
+ <span class='type'>(<tt><span class='object_link'><a href="String.html" title="String (class)">String</a></span></tt>)</span>
414
+
415
+
416
+ <em class="default">(defaults to: <tt>&quot;out.bib&quot;</tt>)</em>
417
+
418
+
419
+ &mdash;
420
+ <div class='inline'>
421
+ <p>File name to write data to - or nil to stdout</p>
422
+ </div>
423
+
424
+ </li>
425
+
426
+ <li>
427
+
428
+ <span class='name'>output</span>
429
+
430
+
431
+ <span class='type'>(<tt><span class='object_link'><a href="String.html" title="String (class)">String</a></span></tt>)</span>
432
+
433
+
434
+ <em class="default">(defaults to: <tt>&quot;bib&quot;</tt>)</em>
435
+
436
+
437
+ &mdash;
438
+ <div class='inline'>
439
+ <p>Type of output. Only bibtex for now</p>
440
+ </div>
441
+
442
+ </li>
443
+
444
+ </ul>
445
+
446
+
447
+ </div><table class="source_code">
448
+ <tr>
449
+ <td>
450
+ <pre class="lines">
451
+
452
+
453
+ 38
454
+ 39
455
+ 40
456
+ 41
457
+ 42
458
+ 43
459
+ 44
460
+ 45
461
+ 46
462
+ 47
463
+ 48
464
+ 49
465
+ 50
466
+ 51
467
+ 52
468
+ 53
469
+ 54
470
+ 55
471
+ 56
472
+ 57
473
+ 58
474
+ 59
475
+ 60
476
+ 61
477
+ 62
478
+ 63
479
+ 64
480
+ 65
481
+ 66
482
+ 67
483
+ 68
484
+ 69
485
+ 70
486
+ 71
487
+ 72
488
+ 73
489
+ 74
490
+ 75
491
+ 76
492
+ 77
493
+ 78
494
+ 79
495
+ 80
496
+ 81
497
+ 82
498
+ 83
499
+ 84
500
+ 85
501
+ 86
502
+ 87
503
+ 88
504
+ 89
505
+ 90
506
+ 91
507
+ 92
508
+ 93
509
+ 94
510
+ 95
511
+ 96
512
+ 97
513
+ 98
514
+ 99
515
+ 100
516
+ 101
517
+ 102
518
+ 103
519
+ 104
520
+ 105
521
+ 106
522
+ 107
523
+ 108
524
+ 109
525
+ 110
526
+ 111
527
+ 112
528
+ 113
529
+ 114
530
+ 115
531
+ 116
532
+ 117</pre>
533
+ </td>
534
+ <td>
535
+ <pre class="code"><span class="info file"># File 'lib/extcite.rb', line 38</span>
536
+
537
+ <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_extract'>extract</span><span class='lparen'>(</span><span class='label'>path:</span><span class='comma'>,</span> <span class='label'>file:</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>out.bib</span><span class='tstring_end'>&quot;</span></span><span class='comma'>,</span> <span class='label'>output:</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>bib</span><span class='tstring_end'>&quot;</span></span><span class='rparen'>)</span>
538
+ <span class='id identifier rubyid_path'>path</span> <span class='op'>=</span> <span class='id identifier rubyid_make_paths'><span class='object_link'><a href="top-level-namespace.html#make_paths-instance_method" title="#make_paths (method)">make_paths</a></span></span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='rparen'>)</span>
539
+ <span class='id identifier rubyid_path'>path</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_x'>x</span><span class='op'>|</span>
540
+ <span class='comment'># try PDF metadata first
541
+ </span> <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='kw'>nil</span>
542
+ <span class='id identifier rubyid_rr'>rr</span> <span class='op'>=</span> <span class='const'>PDF</span><span class='op'>::</span><span class='const'>Reader</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='id identifier rubyid_x'>x</span><span class='rparen'>)</span><span class='semicolon'>;</span>
543
+ <span class='id identifier rubyid_pdfmeta'>pdfmeta</span> <span class='op'>=</span> <span class='id identifier rubyid_rr'>rr</span><span class='period'>.</span><span class='id identifier rubyid_metadata'>metadata</span>
544
+ <span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_pdfmeta'>pdfmeta</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
545
+ <span class='id identifier rubyid_xml'>xml</span> <span class='op'>=</span> <span class='const'>Oga</span><span class='period'>.</span><span class='id identifier rubyid_parse_xml'>parse_xml</span><span class='lparen'>(</span><span class='id identifier rubyid_pdfmeta'>pdfmeta</span><span class='rparen'>)</span><span class='semicolon'>;</span>
546
+ <span class='kw'>begin</span>
547
+ <span class='id identifier rubyid_tt'>tt</span> <span class='op'>=</span> <span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_xpath'>xpath</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//rdf:Description</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span> <span class='comment'># try dc:identifier attribute
548
+ </span>
549
+ <span class='id identifier rubyid_ss'>ss</span> <span class='op'>=</span> <span class='id identifier rubyid_tt'>tt</span><span class='period'>.</span><span class='id identifier rubyid_attr'>attr</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>dc:identifier</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span><span class='lbracket'>[</span><span class='int'>0</span><span class='rbracket'>]</span>
550
+ <span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_ss'>ss</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
551
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='id identifier rubyid_ss'>ss</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span><span class='period'>.</span><span class='id identifier rubyid_sub'>sub</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>doi:</span><span class='regexp_end'>/</span></span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span>
552
+ <span class='kw'>else</span>
553
+ <span class='comment'># try prism:doi node
554
+ </span> <span class='id identifier rubyid_pdoi'>pdoi</span> <span class='op'>=</span> <span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_xpath'>xpath</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//rdf:Description//prism:doi</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span>
555
+ <span class='kw'>if</span> <span class='id identifier rubyid_pdoi'>pdoi</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>==</span> <span class='int'>1</span>
556
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='id identifier rubyid_pdoi'>pdoi</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
557
+ <span class='kw'>else</span>
558
+ <span class='comment'># try pdf:WPS-ARTICLEDOI node
559
+ </span> <span class='id identifier rubyid_wpsdoi'>wpsdoi</span> <span class='op'>=</span> <span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_xpath'>xpath</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//rdf:Description//pdf:WPS-ARTICLEDOI</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span>
560
+ <span class='kw'>if</span> <span class='id identifier rubyid_wpsdoi'>wpsdoi</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>==</span> <span class='int'>1</span>
561
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='id identifier rubyid_wpsdoi'>wpsdoi</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
562
+ <span class='kw'>else</span>
563
+ <span class='comment'># try pdfx:WPS-ARTICLEDOI node
564
+ </span> <span class='id identifier rubyid_pdfxwpsdoi'>pdfxwpsdoi</span> <span class='op'>=</span> <span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_xpath'>xpath</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//rdf:Description//pdfx:WPS-ARTICLEDOI</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span>
565
+ <span class='kw'>if</span> <span class='id identifier rubyid_pdfxwpsdoi'>pdfxwpsdoi</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>==</span> <span class='int'>1</span>
566
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='id identifier rubyid_pdfxwpsdoi'>pdfxwpsdoi</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
567
+ <span class='kw'>else</span>
568
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='kw'>nil</span>
569
+ <span class='kw'>end</span>
570
+ <span class='kw'>end</span>
571
+ <span class='kw'>end</span>
572
+ <span class='kw'>end</span>
573
+ <span class='kw'>rescue</span>
574
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='kw'>nil</span>
575
+ <span class='kw'>end</span>
576
+ <span class='kw'>end</span>
577
+
578
+ <span class='comment'># if not found, try regexing for DOI
579
+ </span> <span class='kw'>if</span> <span class='id identifier rubyid_ids'>ids</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
580
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_get_ids'><span class='object_link'><a href="#get_ids-class_method" title="Extcite.get_ids (method)">get_ids</a></span></span><span class='lparen'>(</span><span class='label'>txt:</span> <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_extract_text_one'>extract_text_one</span><span class='lparen'>(</span><span class='id identifier rubyid_x'>x</span><span class='rparen'>)</span><span class='rparen'>)</span>
581
+ <span class='kw'>end</span>
582
+
583
+ <span class='kw'>if</span> <span class='id identifier rubyid_ids'>ids</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>==</span> <span class='int'>0</span>
584
+ <span class='id identifier rubyid_puts'>puts</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>no DOI found in </span><span class='tstring_end'>&quot;</span></span> <span class='op'>+</span> <span class='id identifier rubyid_x'>x</span>
585
+ <span class='kw'>else</span>
586
+ <span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_ids'>ids</span><span class='period'>.</span><span class='id identifier rubyid_match'>match</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>arxiv</span><span class='regexp_end'>/i</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span> <span class='op'>&amp;&amp;</span> <span class='id identifier rubyid_ids'>ids</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>&lt;</span> <span class='int'>200</span>
587
+ <span class='id identifier rubyid_conn'>conn</span> <span class='op'>=</span> <span class='const'>Faraday</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='symbol'>:url</span> <span class='op'>=&gt;</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>http://export.arxiv.org/api/query?id_list=</span><span class='tstring_end'>&#39;</span></span> <span class='op'>+</span> <span class='id identifier rubyid_ids'>ids</span><span class='period'>.</span><span class='id identifier rubyid_gsub'>gsub</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>arxiv:</span><span class='regexp_end'>/i</span></span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_get'>get</span>
588
+ <span class='id identifier rubyid_bibs'>bibs</span> <span class='op'>=</span> <span class='id identifier rubyid_conn'>conn</span><span class='period'>.</span><span class='id identifier rubyid_body'>body</span><span class='period'>.</span><span class='id identifier rubyid_make_bib_arxiv'>make_bib_arxiv</span><span class='lparen'>(</span><span class='id identifier rubyid_ids'>ids</span><span class='period'>.</span><span class='id identifier rubyid_gsub'>gsub</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>arxiv:</span><span class='regexp_end'>/i</span></span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span><span class='rparen'>)</span>
589
+ <span class='kw'>else</span>
590
+ <span class='id identifier rubyid_bibs'>bibs</span> <span class='op'>=</span> <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_cont_neg'><span class='object_link'><a href="#cont_neg-class_method" title="Extcite.cont_neg (method)">cont_neg</a></span></span><span class='lparen'>(</span><span class='label'>ids:</span> <span class='id identifier rubyid_ids'>ids</span><span class='rparen'>)</span>
591
+ <span class='kw'>end</span>
592
+
593
+ <span class='comment'># if an error or not found, skip
594
+ </span> <span class='id identifier rubyid_bibstest'>bibstest</span> <span class='op'>=</span> <span class='kw'>nil</span>
595
+ <span class='kw'>if</span> <span class='id identifier rubyid_bibs'>bibs</span><span class='period'>.</span><span class='id identifier rubyid_class'>class</span> <span class='op'>==</span> <span class='const'><span class='object_link'><a href="Array.html" title="Array (class)">Array</a></span></span>
596
+ <span class='id identifier rubyid_bibstest'>bibstest</span> <span class='op'>=</span> <span class='id identifier rubyid_bibs'>bibs</span><span class='lbracket'>[</span><span class='int'>0</span><span class='rbracket'>]</span>
597
+ <span class='kw'>else</span>
598
+ <span class='id identifier rubyid_bibstest'>bibstest</span> <span class='op'>=</span> <span class='id identifier rubyid_bibs'>bibs</span>
599
+ <span class='kw'>end</span>
600
+
601
+ <span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_bibstest'>bibstest</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
602
+ <span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_bibstest'>bibstest</span><span class='period'>.</span><span class='id identifier rubyid_match'>match</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>error|not found</span><span class='regexp_end'>/i</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span> <span class='op'>||</span> <span class='op'>!</span><span class='id identifier rubyid_bibstest'>bibstest</span><span class='period'>.</span><span class='id identifier rubyid_match'>match</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>&lt;\/html&gt;</span><span class='regexp_end'>/i</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
603
+ <span class='id identifier rubyid_puts'>puts</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>DOI found: </span><span class='tstring_end'>&quot;</span></span> <span class='op'>+</span> <span class='id identifier rubyid_ids'>ids</span> <span class='op'>+</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'> ; but citation not found via content negotation - passing</span><span class='tstring_end'>&quot;</span></span> <span class='comment'># do something else?
604
+ </span>
605
+ <span class='kw'>else</span>
606
+ <span class='kw'>if</span> <span class='id identifier rubyid_file'>file</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
607
+ <span class='kw'>return</span> <span class='id identifier rubyid_bibstest'>bibstest</span>
608
+ <span class='kw'>else</span>
609
+ <span class='id identifier rubyid_puts'>puts</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>writing </span><span class='tstring_end'>&quot;</span></span> <span class='op'>+</span> <span class='id identifier rubyid_ids'>ids</span> <span class='op'>+</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'> to </span><span class='tstring_end'>&quot;</span></span> <span class='op'>+</span> <span class='id identifier rubyid_file'>file</span>
610
+ <span class='id identifier rubyid_bibs'>bibs</span><span class='period'>.</span><span class='id identifier rubyid_write_bib'>write_bib</span><span class='lparen'>(</span><span class='id identifier rubyid_file'>file</span><span class='rparen'>)</span>
611
+ <span class='kw'>end</span>
612
+ <span class='kw'>end</span>
613
+ <span class='kw'>end</span>
614
+ <span class='kw'>end</span>
615
+ <span class='kw'>end</span>
616
+ <span class='kw'>end</span></pre>
617
+ </td>
618
+ </tr>
619
+ </table>
620
+ </div>
621
+
622
+ <div class="method_details ">
623
+ <h3 class="signature " id="extract_dois-class_method">
624
+
625
+ .<strong>extract_dois</strong>(path:) &#x21d2; <tt>Object</tt>
626
+
627
+
628
+
629
+
630
+
631
+ </h3><div class="docstring">
632
+ <div class="discussion">
633
+
634
+ <p>Extract DOIs from one or more PDFs after extracting text</p>
635
+
636
+
637
+ </div>
638
+ </div>
639
+ <div class="tags">
640
+
641
+ <div class="examples">
642
+ <p class="tag_title">Examples:</p>
643
+
644
+
645
+ <pre class="example code"><code><span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>extcite</span><span class='tstring_end'>&#39;</span></span>
646
+ <span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>faraday</span><span class='tstring_end'>&#39;</span></span><span class='comment'># get a paper in pdf format
647
+ </span>
648
+ <span class='id identifier rubyid_path'>path</span> <span class='op'>=</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>2068.pdf</span><span class='tstring_end'>&#39;</span></span>
649
+ <span class='id identifier rubyid_res'>res</span> <span class='op'>=</span> <span class='const'>Faraday</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='symbol'>:url</span> <span class='op'>=&gt;</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>https://peerj.com/articles/2068.pdf</span><span class='tstring_end'>&quot;</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_get'>get</span>
650
+ <span class='id identifier rubyid_f'>f</span> <span class='op'>=</span> <span class='const'>File</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>wb</span><span class='tstring_end'>&quot;</span></span><span class='rparen'>)</span>
651
+ <span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_write'>write</span><span class='lparen'>(</span><span class='id identifier rubyid_res'>res</span><span class='period'>.</span><span class='id identifier rubyid_body'>body</span><span class='rparen'>)</span>
652
+ <span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_close'>close</span><span class='lparen'>(</span><span class='rparen'>)</span><span class='comment'># extract doi from the pdf
653
+ </span>
654
+ <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_extract_dois'>extract_dois</span><span class='lparen'>(</span><span class='label'>path:</span> <span class='id identifier rubyid_path'>path</span><span class='rparen'>)</span></code></pre>
655
+
656
+ </div>
657
+ <p class="tag_title">Parameters:</p>
658
+ <ul class="param">
659
+
660
+ <li>
661
+
662
+ <span class='name'>path</span>
663
+
664
+
665
+ <span class='type'>(<tt><span class='object_link'><a href="String.html" title="String (class)">String</a></span></tt>)</span>
666
+
667
+
668
+
669
+ &mdash;
670
+ <div class='inline'>
671
+ <p>Path to a PDF file, or a folder of PDF files</p>
672
+ </div>
673
+
674
+ </li>
675
+
676
+ </ul>
677
+
678
+
679
+ </div><table class="source_code">
680
+ <tr>
681
+ <td>
682
+ <pre class="lines">
683
+
684
+
685
+ 210
686
+ 211
687
+ 212
688
+ 213
689
+ 214</pre>
690
+ </td>
691
+ <td>
692
+ <pre class="code"><span class="info file"># File 'lib/extcite.rb', line 210</span>
693
+
694
+ <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_extract_dois'>extract_dois</span><span class='lparen'>(</span><span class='label'>path:</span><span class='rparen'>)</span>
695
+ <span class='id identifier rubyid_txt'>txt</span> <span class='op'>=</span> <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_extract_text'><span class='object_link'><a href="#extract_text-class_method" title="Extcite.extract_text (method)">extract_text</a></span></span><span class='lparen'>(</span><span class='label'>path:</span> <span class='id identifier rubyid_path'>path</span><span class='rparen'>)</span> <span class='comment'># return txt.map { |z| z.match(&quot;[0-9]+\\.[0-9]+/.+&quot;).to_s.gsub(/\s.+/, &#39;&#39;) }
696
+ </span>
697
+ <span class='kw'>return</span> <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_get_ids'><span class='object_link'><a href="#get_ids-class_method" title="Extcite.get_ids (method)">get_ids</a></span></span><span class='lparen'>(</span><span class='label'>txt:</span> <span class='id identifier rubyid_txt'>txt</span><span class='rparen'>)</span>
698
+ <span class='kw'>end</span></pre>
699
+ </td>
700
+ </tr>
701
+ </table>
702
+ </div>
703
+
704
+ <div class="method_details ">
705
+ <h3 class="signature " id="extract_from_metadata-class_method">
706
+
707
+ .<strong>extract_from_metadata</strong>(path:) &#x21d2; <tt>Object</tt>
708
+
709
+
710
+
711
+
712
+
713
+ </h3><div class="docstring">
714
+ <div class="discussion">
715
+
716
+ <p>Try to extract DOIs from one or more PDF metadata sections</p>
717
+
718
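+ <p>Return: the DOI as a String when it is found in the PDF metadata; otherwise the Array returned by get_ids for the text extracted from the PDF</p>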
719
+
720
+
721
+ </div>
722
+ </div>
723
+ <div class="tags">
724
+
725
+ <div class="examples">
726
+ <p class="tag_title">Examples:</p>
727
+
728
+
729
+ <pre class="example code"><code><span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>extcite</span><span class='tstring_end'>&#39;</span></span>
730
+ <span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>faraday</span><span class='tstring_end'>&#39;</span></span><span class='comment'># get a paper in pdf format
731
+ </span>
732
+ <span class='id identifier rubyid_path'>path</span> <span class='op'>=</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>2068.pdf</span><span class='tstring_end'>&#39;</span></span>
733
+ <span class='id identifier rubyid_res'>res</span> <span class='op'>=</span> <span class='const'>Faraday</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='symbol'>:url</span> <span class='op'>=&gt;</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>https://peerj.com/articles/2068.pdf</span><span class='tstring_end'>&quot;</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_get'>get</span><span class='semicolon'>;</span>
734
+ <span class='id identifier rubyid_f'>f</span> <span class='op'>=</span> <span class='const'>File</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>wb</span><span class='tstring_end'>&quot;</span></span><span class='rparen'>)</span><span class='semicolon'>;</span>
735
+ <span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_write'>write</span><span class='lparen'>(</span><span class='id identifier rubyid_res'>res</span><span class='period'>.</span><span class='id identifier rubyid_body'>body</span><span class='rparen'>)</span>
736
+ <span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_close'>close</span><span class='lparen'>(</span><span class='rparen'>)</span><span class='comment'># extract doi from the pdf
737
+ </span>
738
+ <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_extract_from_metadata'>extract_from_metadata</span><span class='lparen'>(</span><span class='label'>path:</span> <span class='id identifier rubyid_path'>path</span><span class='rparen'>)</span></code></pre>
739
+
740
+ </div>
741
+ <p class="tag_title">Parameters:</p>
742
+ <ul class="param">
743
+
744
+ <li>
745
+
746
+ <span class='name'>path</span>
747
+
748
+
749
+ <span class='type'>(<tt><span class='object_link'><a href="String.html" title="String (class)">String</a></span></tt>)</span>
750
+
751
+
752
+
753
+ &mdash;
754
+ <div class='inline'>
755
+ <p>Path to a PDF file, or a folder of PDF files</p>
756
+ </div>
757
+
758
+ </li>
759
+
760
+ </ul>
761
+
762
+
763
+ </div><table class="source_code">
764
+ <tr>
765
+ <td>
766
+ <pre class="lines">
767
+
768
+
769
+ 137
770
+ 138
771
+ 139
772
+ 140
773
+ 141
774
+ 142
775
+ 143
776
+ 144
777
+ 145
778
+ 146
779
+ 147
780
+ 148
781
+ 149
782
+ 150
783
+ 151
784
+ 152
785
+ 153
786
+ 154
787
+ 155
788
+ 156
789
+ 157
790
+ 158
791
+ 159
792
+ 160
793
+ 161
794
+ 162
795
+ 163
796
+ 164
797
+ 165
798
+ 166
799
+ 167
800
+ 168
801
+ 169
802
+ 170
803
+ 171
804
+ 172
805
+ 173
806
+ 174
807
+ 175
808
+ 176
809
+ 177
810
+ 178
811
+ 179
812
+ 180
813
+ 181
814
+ 182
815
+ 183
816
+ 184
817
+ 185
818
+ 186
819
+ 187
820
+ 188
821
+ 189
822
+ 190
823
+ 191
824
+ 192</pre>
825
+ </td>
826
+ <td>
827
+ <pre class="code"><span class="info file"># File 'lib/extcite.rb', line 137</span>
828
+
829
+ <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_extract_from_metadata'>extract_from_metadata</span><span class='lparen'>(</span><span class='label'>path:</span><span class='rparen'>)</span>
830
+ <span class='id identifier rubyid_path'>path</span> <span class='op'>=</span> <span class='id identifier rubyid_make_paths'><span class='object_link'><a href="top-level-namespace.html#make_paths-instance_method" title="#make_paths (method)">make_paths</a></span></span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='rparen'>)</span>
831
+ <span class='id identifier rubyid_path'>path</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_x'>x</span><span class='op'>|</span>
832
+ <span class='comment'># try PDF metadata first
833
+ </span> <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='kw'>nil</span>
834
+ <span class='id identifier rubyid_rr'>rr</span> <span class='op'>=</span> <span class='const'>PDF</span><span class='op'>::</span><span class='const'>Reader</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='id identifier rubyid_x'>x</span><span class='rparen'>)</span>
835
+ <span class='id identifier rubyid_pdfmeta'>pdfmeta</span> <span class='op'>=</span> <span class='id identifier rubyid_rr'>rr</span><span class='period'>.</span><span class='id identifier rubyid_metadata'>metadata</span>
836
+ <span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_pdfmeta'>pdfmeta</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
837
+ <span class='kw'>begin</span>
838
+ <span class='id identifier rubyid_xml'>xml</span> <span class='op'>=</span> <span class='const'>Oga</span><span class='period'>.</span><span class='id identifier rubyid_parse_xml'>parse_xml</span><span class='lparen'>(</span><span class='id identifier rubyid_pdfmeta'>pdfmeta</span><span class='rparen'>)</span><span class='semicolon'>;</span>
839
+ <span class='kw'>rescue</span> <span class='const'>Exception</span> <span class='op'>=&gt;</span> <span class='id identifier rubyid_e'>e</span>
840
+ <span class='id identifier rubyid_xml'>xml</span> <span class='op'>=</span> <span class='kw'>nil</span>
841
+ <span class='kw'>end</span>
842
+
843
+ <span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
844
+ <span class='kw'>begin</span>
845
+ <span class='id identifier rubyid_tt'>tt</span> <span class='op'>=</span> <span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_xpath'>xpath</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//rdf:Description</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span> <span class='comment'># try dc:identifier attribute
846
+ </span>
847
+ <span class='id identifier rubyid_ss'>ss</span> <span class='op'>=</span> <span class='id identifier rubyid_tt'>tt</span><span class='period'>.</span><span class='id identifier rubyid_attr'>attr</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>dc:identifier</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span><span class='lbracket'>[</span><span class='int'>0</span><span class='rbracket'>]</span>
848
+ <span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_ss'>ss</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
849
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='id identifier rubyid_ss'>ss</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span><span class='period'>.</span><span class='id identifier rubyid_sub'>sub</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>doi:</span><span class='regexp_end'>/</span></span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span>
850
+ <span class='kw'>else</span>
851
+ <span class='comment'># try prism:doi node
852
+ </span> <span class='id identifier rubyid_pdoi'>pdoi</span> <span class='op'>=</span> <span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_xpath'>xpath</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//rdf:Description//prism:doi</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span>
853
+ <span class='kw'>if</span> <span class='id identifier rubyid_pdoi'>pdoi</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>==</span> <span class='int'>1</span>
854
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='id identifier rubyid_pdoi'>pdoi</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
855
+ <span class='kw'>else</span>
856
+ <span class='comment'># try pdf:WPS-ARTICLEDOI node
857
+ </span> <span class='id identifier rubyid_wpsdoi'>wpsdoi</span> <span class='op'>=</span> <span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_xpath'>xpath</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//rdf:Description//pdf:WPS-ARTICLEDOI</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span>
858
+ <span class='kw'>if</span> <span class='id identifier rubyid_wpsdoi'>wpsdoi</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>==</span> <span class='int'>1</span>
859
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='id identifier rubyid_wpsdoi'>wpsdoi</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
860
+ <span class='kw'>else</span>
861
+ <span class='comment'># try pdfx:WPS-ARTICLEDOI node
862
+ </span> <span class='id identifier rubyid_pdfxwpsdoi'>pdfxwpsdoi</span> <span class='op'>=</span> <span class='id identifier rubyid_xml'>xml</span><span class='period'>.</span><span class='id identifier rubyid_xpath'>xpath</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>//rdf:Description//pdfx:WPS-ARTICLEDOI</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span>
863
+ <span class='kw'>if</span> <span class='id identifier rubyid_pdfxwpsdoi'>pdfxwpsdoi</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>==</span> <span class='int'>1</span>
864
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='id identifier rubyid_pdfxwpsdoi'>pdfxwpsdoi</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span>
865
+ <span class='kw'>else</span>
866
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='kw'>nil</span>
867
+ <span class='kw'>end</span>
868
+ <span class='kw'>end</span>
869
+ <span class='kw'>end</span>
870
+ <span class='kw'>end</span>
871
+ <span class='kw'>rescue</span>
872
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='kw'>nil</span>
873
+ <span class='kw'>end</span>
874
+ <span class='kw'>end</span>
875
+ <span class='kw'>end</span>
876
+
877
+ <span class='comment'># if not found, try regexing for DOI
878
+ </span> <span class='kw'>if</span> <span class='id identifier rubyid_ids'>ids</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span>
879
+ <span class='id identifier rubyid_ids'>ids</span> <span class='op'>=</span> <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_get_ids'><span class='object_link'><a href="#get_ids-class_method" title="Extcite.get_ids (method)">get_ids</a></span></span><span class='lparen'>(</span><span class='label'>txt:</span> <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_extract_text_one'>extract_text_one</span><span class='lparen'>(</span><span class='id identifier rubyid_x'>x</span><span class='rparen'>)</span><span class='rparen'>)</span>
880
+ <span class='kw'>end</span>
881
+
882
+ <span class='kw'>return</span> <span class='id identifier rubyid_ids'>ids</span>
883
+ <span class='kw'>end</span>
884
+ <span class='kw'>end</span></pre>
885
+ </td>
886
+ </tr>
887
+ </table>
888
+ </div>
889
+
890
+ <div class="method_details ">
891
+ <h3 class="signature " id="extract_text-class_method">
892
+
893
+ .<strong>extract_text</strong>(path:) &#x21d2; <tt>Object</tt>
894
+
895
+
896
+
897
+
898
+
899
+ </h3><div class="docstring">
900
+ <div class="discussion">
901
+
902
+ <p>Extract text from a PDF, or many PDFs</p>
903
+
904
+ <p>This method is used internally within fetch to parse PDFs.</p>
905
+
906
+
907
+ </div>
908
+ </div>
909
+ <div class="tags">
910
+
911
+ <div class="examples">
912
+ <p class="tag_title">Examples:</p>
913
+
914
+
915
+ <pre class="example code"><code><span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>extcite</span><span class='tstring_end'>&#39;</span></span>
916
+ <span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>faraday</span><span class='tstring_end'>&#39;</span></span><span class='comment'># get a paper in pdf format
917
+ </span>
918
+ <span class='id identifier rubyid_path'>path</span> <span class='op'>=</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>2068.pdf</span><span class='tstring_end'>&#39;</span></span>
919
+ <span class='id identifier rubyid_res'>res</span> <span class='op'>=</span> <span class='const'>Faraday</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='symbol'>:url</span> <span class='op'>=&gt;</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>https://peerj.com/articles/2068.pdf</span><span class='tstring_end'>&quot;</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_get'>get</span>
920
+ <span class='id identifier rubyid_f'>f</span> <span class='op'>=</span> <span class='const'>File</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>wb</span><span class='tstring_end'>&quot;</span></span><span class='rparen'>)</span>
921
+ <span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_write'>write</span><span class='lparen'>(</span><span class='id identifier rubyid_res'>res</span><span class='period'>.</span><span class='id identifier rubyid_body'>body</span><span class='rparen'>)</span>
922
+ <span class='id identifier rubyid_f'>f</span><span class='period'>.</span><span class='id identifier rubyid_close'>close</span><span class='lparen'>(</span><span class='rparen'>)</span><span class='comment'># extract doi from the pdf
923
+ </span>
924
+ <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_extract_text'>extract_text</span><span class='lparen'>(</span><span class='label'>path:</span> <span class='id identifier rubyid_path'>path</span><span class='rparen'>)</span></code></pre>
925
+
926
+ </div>
927
+ <p class="tag_title">Parameters:</p>
928
+ <ul class="param">
929
+
930
+ <li>
931
+
932
+ <span class='name'>path</span>
933
+
934
+
935
+ <span class='type'>(<tt><span class='object_link'><a href="String.html" title="String (class)">String</a></span></tt>)</span>
936
+
937
+
938
+
939
+ &mdash;
940
+ <div class='inline'>
941
+ <p>Path to a PDF file, or a folder of PDF files</p>
942
+ </div>
943
+
944
+ </li>
945
+
946
+ </ul>
947
+
948
+
949
+ </div><table class="source_code">
950
+ <tr>
951
+ <td>
952
+ <pre class="lines">
953
+
954
+
955
+ 263
956
+ 264
957
+ 265
958
+ 266
959
+ 267
960
+ 268
961
+ 269
962
+ 270
963
+ 271
964
+ 272
965
+ 273
966
+ 274
967
+ 275
968
+ 276
969
+ 277
970
+ 278
971
+ 279
972
+ 280
973
+ 281
974
+ 282
975
+ 283
976
+ 284</pre>
977
+ </td>
978
+ <td>
979
+ <pre class="code"><span class="info file"># File 'lib/extcite.rb', line 263</span>
980
+
981
+ <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_extract_text'>extract_text</span><span class='lparen'>(</span><span class='label'>path:</span><span class='rparen'>)</span>
982
+ <span class='id identifier rubyid_path'>path</span> <span class='op'>=</span> <span class='const'><span class='object_link'><a href="Array.html" title="Array (class)">Array</a></span></span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='rparen'>)</span>
983
+ <span class='kw'>if</span> <span class='id identifier rubyid_path'>path</span><span class='period'>.</span><span class='id identifier rubyid_length'>length</span> <span class='op'>==</span> <span class='int'>1</span>
984
+ <span class='kw'>if</span> <span class='const'>File</span><span class='period'>.</span><span class='id identifier rubyid_directory?'>directory?</span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='lbracket'>[</span><span class='int'>0</span><span class='rbracket'>]</span><span class='rparen'>)</span> <span class='comment'># keep only files with .pdf extension
985
+ </span>
986
+ <span class='id identifier rubyid_path'>path</span> <span class='op'>=</span> <span class='id identifier rubyid_dir_files'><span class='object_link'><a href="top-level-namespace.html#dir_files-instance_method" title="#dir_files (method)">dir_files</a></span></span><span class='lparen'>(</span><span class='id identifier rubyid_path'>path</span><span class='lbracket'>[</span><span class='int'>0</span><span class='rbracket'>]</span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_keep_if'>keep_if</span> <span class='lbrace'>{</span> <span class='op'>|</span><span class='id identifier rubyid_z'>z</span><span class='op'>|</span> <span class='op'>!</span><span class='op'>!</span><span class='id identifier rubyid_z'>z</span><span class='period'>.</span><span class='id identifier rubyid_match'>match</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>.pdf</span><span class='regexp_end'>/</span></span><span class='rparen'>)</span> <span class='rbrace'>}</span>
987
+ <span class='kw'>end</span>
988
+ <span class='kw'>end</span>
989
+
990
+ <span class='id identifier rubyid_out'>out</span> <span class='op'>=</span> <span class='lbracket'>[</span><span class='rbracket'>]</span>
991
+ <span class='id identifier rubyid_path'>path</span><span class='period'>.</span><span class='id identifier rubyid_each'>each</span> <span class='kw'>do</span> <span class='op'>|</span><span class='id identifier rubyid_x'>x</span><span class='op'>|</span>
992
+ <span class='kw'>begin</span>
993
+ <span class='id identifier rubyid_rr'>rr</span> <span class='op'>=</span> <span class='const'>PDF</span><span class='op'>::</span><span class='const'>Reader</span><span class='period'>.</span><span class='id identifier rubyid_new'>new</span><span class='lparen'>(</span><span class='id identifier rubyid_x'>x</span><span class='rparen'>)</span>
994
+ <span class='id identifier rubyid_txt'>txt</span> <span class='op'>=</span> <span class='id identifier rubyid_rr'>rr</span><span class='period'>.</span><span class='id identifier rubyid_pages'>pages</span><span class='period'>.</span><span class='id identifier rubyid_map'>map</span> <span class='lbrace'>{</span> <span class='op'>|</span><span class='id identifier rubyid_page'>page</span><span class='op'>|</span> <span class='id identifier rubyid_page'>page</span><span class='period'>.</span><span class='id identifier rubyid_text'>text</span> <span class='rbrace'>}</span><span class='period'>.</span><span class='id identifier rubyid_join'>join</span><span class='lparen'>(</span><span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_content'>\n</span><span class='tstring_end'>&quot;</span></span><span class='rparen'>)</span>
995
+ <span class='kw'>rescue</span> <span class='const'>Exception</span> <span class='op'>=&gt;</span> <span class='id identifier rubyid_e'>e</span>
996
+ <span class='id identifier rubyid_warn'>warn</span> <span class='id identifier rubyid_e'>e</span>
997
+ <span class='id identifier rubyid_txt'>txt</span> <span class='op'>=</span> <span class='tstring'><span class='tstring_beg'>&quot;</span><span class='tstring_end'>&quot;</span></span>
998
+ <span class='kw'>end</span>
999
+ <span class='id identifier rubyid_out'>out</span> <span class='op'>&lt;&lt;</span> <span class='id identifier rubyid_txt'>txt</span>
1000
+ <span class='kw'>end</span>
1001
+ <span class='kw'>return</span> <span class='id identifier rubyid_out'>out</span>
1002
+ <span class='kw'>end</span></pre>
1003
+ </td>
1004
+ </tr>
1005
+ </table>
1006
+ </div>
1007
+
1008
+ <div class="method_details ">
1009
+ <h3 class="signature " id="get_ids-class_method">
1010
+
1011
+ .<strong>get_ids</strong>(txt:) &#x21d2; <tt>Object</tt>
1012
+
1013
+
1014
+
1015
+
1016
+
1017
+ </h3><div class="docstring">
1018
+ <div class="discussion">
1019
+
1020
+ <p>Get DOIs from a String or an Array of Strings</p>
1021
+
1022
+ <p>Return: Array of DOIs</p>
1023
+
1024
+
1025
+ </div>
1026
+ </div>
1027
+ <div class="tags">
1028
+
1029
+ <div class="examples">
1030
+ <p class="tag_title">Examples:</p>
1031
+
1032
+
1033
+ <pre class="example code"><code><span class='id identifier rubyid_require'>require</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>extcite</span><span class='tstring_end'>&#39;</span></span>
1034
+ <span class='const'><span class='object_link'><a href="" title="Extcite (module)">Extcite</a></span></span><span class='period'>.</span><span class='id identifier rubyid_get_ids'>get_ids</span><span class='lparen'>(</span><span class='label'>txt:</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>10.1016/j.dendro.2014.01.004 adfasdf asd fas df asdfsd</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span></code></pre>
1035
+
1036
+ </div>
1037
+ <p class="tag_title">Parameters:</p>
1038
+ <ul class="param">
1039
+
1040
+ <li>
1041
+
1042
+ <span class='name'>txt</span>
1043
+
1044
+
1045
+ <span class='type'>(<tt><span class='object_link'><a href="String.html" title="String (class)">String</a></span></tt>)</span>
1046
+
1047
+
1048
+
1049
+ &mdash;
1050
+ <div class='inline'>
1051
+ <p>String or Array of Strings</p>
1052
+ </div>
1053
+
1054
+ </li>
1055
+
1056
+ </ul>
1057
+
1058
+
1059
+ </div><table class="source_code">
1060
+ <tr>
1061
+ <td>
1062
+ <pre class="lines">
1063
+
1064
+
1065
+ 226
1066
+ 227
1067
+ 228
1068
+ 229
1069
+ 230
1070
+ 231
1071
+ 232
1072
+ 233
1073
+ 234
1074
+ 235
1075
+ 236
1076
+ 237
1077
+ 238
1078
+ 239
1079
+ 240
1080
+ 241
1081
+ 242
1082
+ 243</pre>
1083
+ </td>
1084
+ <td>
1085
+ <pre class="code"><span class="info file"># File 'lib/extcite.rb', line 226</span>
1086
+
1087
+ <span class='kw'>def</span> <span class='kw'>self</span><span class='period'>.</span><span class='id identifier rubyid_get_ids'>get_ids</span><span class='lparen'>(</span><span class='label'>txt:</span><span class='rparen'>)</span>
1088
+ <span class='comment'># see if there&#39;s
1089
+ </span>
1090
+ <span class='kw'>return</span> <span class='const'><span class='object_link'><a href="Array.html" title="Array (class)">Array</a></span></span><span class='lparen'>(</span><span class='id identifier rubyid_txt'>txt</span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_map'>map</span> <span class='lbrace'>{</span> <span class='op'>|</span><span class='id identifier rubyid_z'>z</span><span class='op'>|</span>
1091
+ <span class='comment'># detect if is an arxiv paper
1092
+ </span> <span class='kw'>if</span> <span class='op'>!</span><span class='id identifier rubyid_z'>z</span><span class='period'>.</span><span class='id identifier rubyid_match'>match</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>arxiv:[0-9]+\.[0-9A-Za-z]+</span><span class='regexp_end'>/i</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_nil?'>nil?</span> <span class='comment'># if so, return arxiv id for later extraction of arxiv citation via their API
1093
+ </span>
1094
+ <span class='id identifier rubyid_z'>z</span> <span class='op'>=</span> <span class='id identifier rubyid_z'>z</span><span class='period'>.</span><span class='id identifier rubyid_match'>match</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>arxiv:[0-9]+\.[0-9A-Za-z]+</span><span class='regexp_end'>/i</span></span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_to_s'>to_s</span>
1095
+ <span class='kw'>else</span>
1096
+ <span class='id identifier rubyid_doi_pattern'>doi_pattern</span> <span class='op'>=</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_content'>(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![%&quot;#? ])\\S)+)</span><span class='tstring_end'>&#39;</span></span>
1097
+ <span class='id identifier rubyid_z'>z</span> <span class='op'>=</span> <span class='id identifier rubyid_z'>z</span><span class='period'>.</span><span class='id identifier rubyid_match'>match</span><span class='lparen'>(</span><span class='id identifier rubyid_doi_pattern'>doi_pattern</span><span class='rparen'>)</span><span class='period'>.</span><span class='id identifier rubyid_to_s'>to_s</span><span class='period'>.</span><span class='id identifier rubyid_gsub'>gsub</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>\s.+</span><span class='regexp_end'>/</span></span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span> <span class='comment'># z = z.match(&quot;10\\.[0-9]+/.+&quot;).to_s.gsub(/\s.+/, &#39;&#39;)
1098
+ </span>
1099
+ <span class='kw'>end</span> <span class='comment'># clean up doi
1100
+ </span>
1101
+ <span class='id identifier rubyid_z'>z</span> <span class='op'>=</span> <span class='id identifier rubyid_z'>z</span><span class='period'>.</span><span class='id identifier rubyid_gsub'>gsub</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>\.$|\.;$|\.\]$|\.\}$|\.\)$|,$</span><span class='regexp_end'>/</span></span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span>
1102
+ <span class='kw'>return</span> <span class='id identifier rubyid_z'>z</span><span class='period'>.</span><span class='id identifier rubyid_gsub'>gsub</span><span class='lparen'>(</span><span class='tstring'><span class='regexp_beg'>/</span><span class='tstring_content'>;$|\]$|\}$|\)$</span><span class='regexp_end'>/</span></span><span class='comma'>,</span> <span class='tstring'><span class='tstring_beg'>&#39;</span><span class='tstring_end'>&#39;</span></span><span class='rparen'>)</span>
1103
+ <span class='rbrace'>}</span><span class='lbracket'>[</span><span class='int'>0</span><span class='rbracket'>]</span>
1104
+ <span class='kw'>end</span></pre>
1105
+ </td>
1106
+ </tr>
1107
+ </table>
1108
+ </div>
1109
+
1110
+ </div>
1111
+
1112
+ </div>
1113
+
1114
+ <div id="footer">
1115
+ Generated on Wed Apr 15 16:00:31 2020 by
1116
+ <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
1117
+ 0.9.24 (ruby-2.7.1).
1118
+ </div>
1119
+
1120
+ </div>
1121
+ </body>
1122
+ </html>