textminer 0.1.0 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +0 -1
- data/CHANGELOG.md +9 -0
- data/Gemfile.lock +27 -3
- data/README.md +61 -58
- data/Rakefile +17 -2
- data/bin/tm +1 -1
- data/{lib/textminer → extra}/fetch.rb +0 -0
- data/extra/fetch_method.rb +17 -0
- data/lib/textminer.rb +120 -26
- data/lib/textminer/helpers/configuration.rb +26 -0
- data/lib/textminer/link_methods_array.rb +54 -0
- data/lib/textminer/link_methods_hash.rb +71 -0
- data/lib/textminer/mine_utils.rb +65 -0
- data/lib/textminer/mined.rb +31 -0
- data/lib/textminer/miner.rb +42 -0
- data/lib/textminer/request.rb +24 -7
- data/lib/textminer/response.rb +54 -30
- data/lib/textminer/tmutils.rb +7 -0
- data/lib/textminer/version.rb +1 -1
- data/textminer.gemspec +9 -3
- metadata +112 -10
- data/NEWS.md +0 -3
- data/test/test_tdm.rb +0 -52
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c6c80a22022bb38bc141dc50e8da5d913db03946
|
4
|
+
data.tar.gz: 957cf24214f95f1b2d8309f2fd1a2e2aa7b6ca69
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9837bd893866ef35e420d928bf02f3151783b345d39f758ed5ddce8b98c6df92147ff518b889d5cab33f84aa62ad795a3e7e1e2c6ad18cfd7a9a3060589293eb
|
7
|
+
data.tar.gz: 1151759369e8007f85ad73f24872f409ffcb70e99ad114e7a48e623c48a53ea118c7ed13a4ded171fb64823a2e21e342000f77766aaac93b12493335ace58f1d
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
## 0.1.5 (2015-12-04)
|
2
|
+
|
3
|
+
* Now using `serrano` gem for interacting with the Crossref API
|
4
|
+
* Changed `links` method to `search`
|
5
|
+
* Changed `fetch` method to accept a URL for a full text article instead of a DOI
|
6
|
+
|
7
|
+
## 0.1.0 (2015-08-24)
|
8
|
+
|
9
|
+
* First version to Rubygems
|
data/Gemfile.lock
CHANGED
@@ -1,12 +1,18 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
textminer (0.1.
|
4
|
+
textminer (0.1.5)
|
5
|
+
faraday (~> 0.9.1)
|
6
|
+
faraday_middleware (~> 0.10.0)
|
5
7
|
httparty (~> 0.13)
|
6
8
|
json (~> 1.8)
|
7
|
-
launchy (~> 2.4, >= 2.4.
|
9
|
+
launchy (~> 2.4, >= 2.4.3)
|
10
|
+
multi_json (~> 1.0)
|
11
|
+
nokogiri (~> 1.6, >= 1.6.6.2)
|
8
12
|
pdf-reader (~> 1.3)
|
13
|
+
serrano (~> 0.1.4.1)
|
9
14
|
thor (~> 0.19)
|
15
|
+
uuidtools (~> 2.1, >= 2.1.5)
|
10
16
|
|
11
17
|
GEM
|
12
18
|
remote: https://rubygems.org/
|
@@ -21,14 +27,23 @@ GEM
|
|
21
27
|
simplecov
|
22
28
|
url
|
23
29
|
docile (1.1.5)
|
30
|
+
faraday (0.9.1)
|
31
|
+
multipart-post (>= 1.2, < 3)
|
32
|
+
faraday_middleware (0.10.0)
|
33
|
+
faraday (>= 0.7.4, < 0.10)
|
24
34
|
hashery (2.1.1)
|
25
|
-
httparty (0.13.
|
35
|
+
httparty (0.13.7)
|
26
36
|
json (~> 1.8)
|
27
37
|
multi_xml (>= 0.5.2)
|
28
38
|
json (1.8.3)
|
29
39
|
launchy (2.4.3)
|
30
40
|
addressable (~> 2.3)
|
41
|
+
mini_portile (0.6.2)
|
42
|
+
multi_json (1.11.2)
|
31
43
|
multi_xml (0.5.5)
|
44
|
+
multipart-post (2.0.0)
|
45
|
+
nokogiri (1.6.6.2)
|
46
|
+
mini_portile (~> 0.6.0)
|
32
47
|
oga (1.2.3)
|
33
48
|
ast
|
34
49
|
ruby-ll (~> 2.1)
|
@@ -44,6 +59,11 @@ GEM
|
|
44
59
|
ansi
|
45
60
|
ast
|
46
61
|
ruby-rc4 (0.1.5)
|
62
|
+
serrano (0.1.4.1)
|
63
|
+
faraday (~> 0.9.1)
|
64
|
+
faraday_middleware (~> 0.10.0)
|
65
|
+
multi_json (~> 1.0)
|
66
|
+
thor (~> 0.19)
|
47
67
|
simplecov (0.10.0)
|
48
68
|
docile (~> 1.1.0)
|
49
69
|
json (~> 1.8)
|
@@ -54,6 +74,7 @@ GEM
|
|
54
74
|
thor (0.19.1)
|
55
75
|
ttfunk (1.4.0)
|
56
76
|
url (0.3.2)
|
77
|
+
uuidtools (2.1.5)
|
57
78
|
|
58
79
|
PLATFORMS
|
59
80
|
ruby
|
@@ -66,3 +87,6 @@ DEPENDENCIES
|
|
66
87
|
simplecov (~> 0.10)
|
67
88
|
test-unit (~> 3.1)
|
68
89
|
textminer!
|
90
|
+
|
91
|
+
BUNDLED WITH
|
92
|
+
1.10.6
|
data/README.md
CHANGED
@@ -1,24 +1,29 @@
|
|
1
1
|
textminer
|
2
2
|
=========
|
3
3
|
|
4
|
-
[![
|
4
|
+
[![gem version](https://img.shields.io/gem/v/textminer.svg)](https://rubygems.org/gems/textminer)
|
5
|
+
[![Build Status](https://travis-ci.org/sckott/textminer.svg?branch=master)](https://travis-ci.org/sckott/textminer)
|
5
6
|
[![codecov.io](http://codecov.io/github/sckott/textminer/coverage.svg?branch=master)](http://codecov.io/github/sckott/textminer?branch=master)
|
6
7
|
|
7
|
-
__This is alpha software, so expect changes__
|
8
|
-
|
9
|
-
## What is it?
|
10
|
-
|
11
8
|
__`textminer` helps you text mine through Crossref's TDM (Text & Data Mining) services:__
|
12
9
|
|
13
10
|
## Changes
|
14
11
|
|
15
|
-
For changes see the [
|
12
|
+
For changes see the [CHANGELOG][changelog]
|
13
|
+
|
14
|
+
## gem API
|
15
|
+
|
16
|
+
* `Textminer.search` - search by DOI, query string, filters, etc. to get Crossref metadata, which you can use downstream to get full text links. This method essentially wraps `Serrano.works()`, but exposes only a subset of its params - this interface may change depending on feedback.
|
17
|
+
* `Textminer.fetch` - Fetch full text given a URL; supports Crossref's Text and Data Mining service
|
18
|
+
* `Textminer.extract` - Extract text from a PDF
|
16
19
|
|
17
20
|
## Install
|
18
21
|
|
19
22
|
### Release version
|
20
23
|
|
21
|
-
|
24
|
+
```
|
25
|
+
gem install textminer
|
26
|
+
```
|
22
27
|
|
23
28
|
### Development version
|
24
29
|
|
@@ -28,89 +33,87 @@ cd textminer
|
|
28
33
|
rake install
|
29
34
|
```
|
30
35
|
|
31
|
-
##
|
36
|
+
## Examples
|
37
|
+
|
38
|
+
### Within Ruby
|
39
|
+
|
40
|
+
#### Search
|
32
41
|
|
33
42
|
Search by DOI
|
34
43
|
|
35
44
|
```ruby
|
36
45
|
require 'textminer'
|
37
|
-
|
46
|
+
# link to full text available
|
47
|
+
Textminer.search(doi: '10.7554/elife.06430')
|
48
|
+
# no link to full text available
|
49
|
+
Textminer.search(doi: "10.1371/journal.pone.0000308")
|
38
50
|
```
|
39
51
|
|
40
|
-
|
41
|
-
|
42
|
-
```ruby
|
43
|
-
out.pdf
|
44
|
-
```
|
52
|
+
Many DOIs at once
|
45
53
|
|
46
54
|
```ruby
|
47
|
-
|
55
|
+
require 'serrano'
|
56
|
+
dois = Serrano.random_dois(sample: 6)
|
57
|
+
Textminer.search(doi: dois)
|
48
58
|
```
|
49
59
|
|
50
|
-
|
60
|
+
Search with filters
|
51
61
|
|
52
62
|
```ruby
|
53
|
-
|
63
|
+
Textminer.search(filter: {has_full_text: true})
|
54
64
|
```
|
55
65
|
|
56
|
-
|
57
|
-
"http://annalsofpsychoceramics.labs.crossref.org/fulltext/10.5555/515151.xml"
|
58
|
-
```
|
66
|
+
#### Get full text links
|
59
67
|
|
60
|
-
|
68
|
+
The object returned from `Textminer.search` is an instance of a class that has methods for pulling out all links, XML links only, PDF links only, or plain text links only
|
61
69
|
|
62
70
|
```ruby
|
63
|
-
Textminer.
|
71
|
+
x = Textminer.search(filter: {has_full_text: true})
|
72
|
+
x.links_xml
|
73
|
+
x.links_pdf
|
74
|
+
x.links_plain
|
64
75
|
```
|
65
76
|
|
66
|
-
|
67
|
-
=> {"article"=>
|
68
|
-
{"front"=>
|
69
|
-
{"journal_meta"=>
|
70
|
-
{"journal_id"=>
|
71
|
-
{"__content__"=>"PhytoKeys", "journal_id_type"=>"publisher-id"},
|
72
|
-
"journal_title_group"=>
|
73
|
-
{"journal_title"=>{"__content__"=>"PhytoKeys", "lang"=>"en"},
|
74
|
-
"abbrev_journal_title"=>{"__content__"=>"PhytoKeys", "lang"=>"en"}},
|
75
|
-
"issn"=>
|
76
|
-
[{"__content__"=>"1314-2011", "pub_type"=>"ppub"},
|
77
|
-
{"__content__"=>"1314-2003", "pub_type"=>"epub"}],
|
78
|
-
"publisher"=>{"publisher_name"=>"Pensoft Publishers"}},
|
79
|
-
"article_meta"=>
|
80
|
-
|
81
|
-
...
|
82
|
-
```
|
77
|
+
#### Fetch full text
|
83
78
|
|
84
|
-
|
79
|
+
`Textminer.fetch()` gets full text based on URL input. We determine how to pull down and parse the content based on content type.
|
85
80
|
|
86
81
|
```ruby
|
87
|
-
|
82
|
+
# get some metadata
|
83
|
+
res = Textminer.search(member: 2258, filter: {has_full_text: true});
|
84
|
+
# get links
|
85
|
+
links = res.links_xml(true);
|
86
|
+
# Get full text for an article
|
87
|
+
res = Textminer.fetch(url: links[0]);
|
88
|
+
# url
|
89
|
+
res.url
|
90
|
+
# file path
|
91
|
+
res.path
|
92
|
+
# content type
|
93
|
+
res.type
|
94
|
+
# parse content
|
95
|
+
res.parse
|
88
96
|
```
|
89
97
|
|
90
|
-
|
91
|
-
|
92
|
-
## On the CLI
|
98
|
+
#### Extract text from PDF
|
93
99
|
|
94
|
-
|
100
|
+
`Textminer.extract()` extracts text from a pdf, given a path for a pdf
|
95
101
|
|
96
|
-
```
|
97
|
-
|
102
|
+
```ruby
|
103
|
+
res = Textminer.search(member: 2258, filter: {has_full_text: true});
|
104
|
+
links = res.links_pdf(true);
|
105
|
+
res = Textminer.fetch(url: links[0]);
|
106
|
+
Textminer.extract(res.path)
|
98
107
|
```
|
99
108
|
|
100
|
-
|
101
|
-
http://phytokeys.pensoft.net/lib/ajax_srv/article_elements_srv.php?action=download_xml&item_id=4190
|
102
|
-
http://phytokeys.pensoft.net/lib/ajax_srv/article_elements_srv.php?action=download_pdf&item_id=4190
|
103
|
-
```
|
109
|
+
### On the CLI
|
104
110
|
|
105
|
-
|
106
|
-
|
107
|
-
```sh
|
108
|
-
tm links '10.3897/phytokeys.42.7604,10.3897/zookeys.516.9439'
|
109
|
-
```
|
111
|
+
Coming soon...
|
110
112
|
|
111
113
|
## To do
|
112
114
|
|
113
115
|
* CLI executable
|
114
|
-
* get actual full text
|
115
116
|
* better test suite
|
116
|
-
* documentation
|
117
|
+
* better documentation
|
118
|
+
|
119
|
+
[changelog]: https://github.com/sckott/textminer/blob/master/CHANGELOG.md
|
data/Rakefile
CHANGED
@@ -3,20 +3,35 @@ require 'rake/testtask'
|
|
3
3
|
|
4
4
|
Rake::TestTask.new do |t|
|
5
5
|
t.libs << "test"
|
6
|
-
t.test_files = FileList['test/test
|
6
|
+
t.test_files = FileList['test/test-*.rb']
|
7
7
|
t.verbose = true
|
8
8
|
end
|
9
9
|
|
10
10
|
desc "Run tests"
|
11
11
|
task :default => :test
|
12
12
|
|
13
|
+
desc "Build textminer docs"
|
14
|
+
task :docs do
|
15
|
+
system "yardoc"
|
16
|
+
end
|
17
|
+
|
18
|
+
desc "bundle install"
|
19
|
+
task :bundle do
|
20
|
+
system "bundle install"
|
21
|
+
end
|
22
|
+
|
23
|
+
desc "clean out builds"
|
24
|
+
task :clean do
|
25
|
+
system "ls | grep [0-9].gem | xargs rm"
|
26
|
+
end
|
27
|
+
|
13
28
|
desc "Build textminer"
|
14
29
|
task :build do
|
15
30
|
system "gem build textminer.gemspec"
|
16
31
|
end
|
17
32
|
|
18
33
|
desc "Install textminer"
|
19
|
-
task :install => :build do
|
34
|
+
task :install => [:bundle, :build] do
|
20
35
|
system "gem install textminer-#{Textminer::VERSION}.gem"
|
21
36
|
end
|
22
37
|
|
data/bin/tm
CHANGED
File without changes
|
@@ -0,0 +1,17 @@
|
|
1
|
+
##
|
2
|
+
# Fetch full text for one or more DOIs, as XML (default) or PDF
|
3
|
+
#
|
4
|
+
# @param doi [Array] A DOI, digital object identifier
|
5
|
+
# @param type [Array] One of two options to download: xml (default) or pdf
|
6
|
+
#
|
7
|
+
# @example
|
8
|
+
# require 'textminer'
|
9
|
+
# # fetch full text by DOI - xml by default
|
10
|
+
# Textminer.fetch("10.3897/phytokeys.42.7604")
|
11
|
+
# # many DOIs - xml output
|
12
|
+
# res = Textminer.fetch(["10.3897/phytokeys.42.7604", "10.3897/zookeys.516.9439"])
|
13
|
+
# # fetch full text - pdf
|
14
|
+
# Textminer.fetch("10.3897/phytokeys.42.7604", "pdf")
|
15
|
+
def self.fetch(doi, type = 'xml')
|
16
|
+
Fetch.new(doi, type).fetchtext
|
17
|
+
end
|
data/lib/textminer.rb
CHANGED
@@ -1,49 +1,124 @@
|
|
1
1
|
require 'httparty'
|
2
2
|
require 'json'
|
3
3
|
require 'pdf-reader'
|
4
|
+
require 'serrano'
|
5
|
+
require "textminer/miner"
|
4
6
|
require "textminer/version"
|
5
7
|
require "textminer/request"
|
6
8
|
require "textminer/response"
|
7
|
-
require "textminer/fetch"
|
8
9
|
|
9
10
|
module Textminer
|
11
|
+
extend Configuration
|
12
|
+
|
13
|
+
define_setting :tdm_key
|
14
|
+
|
10
15
|
##
|
11
|
-
#
|
16
|
+
# Search for papers and get full text links
|
12
17
|
#
|
13
18
|
# @param doi [Array] A DOI, digital object identifier
|
19
|
+
# @param options [Array] Curl request options
|
14
20
|
# @return [Array] the output
|
15
21
|
#
|
16
22
|
# @example
|
17
23
|
# require 'textminer'
|
18
24
|
# # link to full text available
|
19
|
-
# Textminer.
|
25
|
+
# Textminer.search(doi: '10.3897/phytokeys.42.7604')
|
20
26
|
# # no link to full text available
|
21
|
-
# Textminer.
|
27
|
+
# Textminer.search(doi: "10.1371/journal.pone.0000308")
|
22
28
|
# # many DOIs at once
|
23
|
-
#
|
29
|
+
# require 'serrano'
|
30
|
+
# dois = Serrano.random_dois(sample: 6)
|
31
|
+
# res = Textminer.search(doi: dois)
|
32
|
+
# res = Textminer.search(doi: ["10.3897/phytokeys.42.7604", "10.3897/zookeys.516.9439"])
|
24
33
|
# res.links
|
25
|
-
# res.
|
26
|
-
# res.
|
27
|
-
|
28
|
-
|
34
|
+
# res.links_pdf
|
35
|
+
# res.links_xml
|
36
|
+
# res.links_plain
|
37
|
+
# # only full text available
|
38
|
+
# x = Textminer.search(doi: '10.3816/clm.2001.n.006')
|
39
|
+
# x.links_xml
|
40
|
+
# x.links_plain
|
41
|
+
# x.links_pdf
|
42
|
+
# # no dois
|
43
|
+
# x = Textminer.search(filter: {has_full_text: true})
|
44
|
+
# x.links_xml
|
45
|
+
# x.links_plain
|
46
|
+
# x = Textminer.search(member: 311, filter: {has_full_text: true})
|
47
|
+
# x.links_pdf
|
48
|
+
def self.search(doi: nil, member: nil, filter: nil, limit: nil, options: nil)
|
49
|
+
Request.new(doi, member, filter, limit, options).perform
|
29
50
|
end
|
30
51
|
|
31
52
|
##
|
32
|
-
#
|
53
|
+
# Get full text
|
33
54
|
#
|
34
|
-
#
|
35
|
-
#
|
55
|
+
# Works easily for open access papers, but not for closed ones. For non-OA papers, use
|
56
|
+
# Crossref's Text and Data Mining service, which requires authentication and
|
57
|
+
# pre-authorized IP address. Go to https://apps.crossref.org/clickthrough/researchers
|
58
|
+
# to sign up for the TDM service, to get your key. The only publishers
|
59
|
+
# taking part at this time are Elsevier and Wiley.
|
60
|
+
#
|
61
|
+
# @param url [String] A url for full text
|
62
|
+
# @return [Mined] An object of class Mined, with methods for extracting
|
63
|
+
# the url requested, the file path, and parsing the plain text, XML, or extracting
|
64
|
+
# text from the pdf.
|
36
65
|
#
|
37
66
|
# @example
|
38
|
-
#
|
39
|
-
#
|
40
|
-
#
|
41
|
-
#
|
42
|
-
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
|
46
|
-
|
67
|
+
# require 'textminer'
|
68
|
+
# # Set authorization
|
69
|
+
# Textminer.configuration do |config|
|
70
|
+
# config.tdm_key = "<your key>"
|
71
|
+
# end
|
72
|
+
# # Get some elsevier works
|
73
|
+
# res = Textminer.search(member: 78, filter: {has_full_text: true});
|
74
|
+
# links = res.links_xml(true);
|
75
|
+
# # Get full text for an article
|
76
|
+
# out = Textminer.fetch(url: links[0]);
|
77
|
+
# out.url
|
78
|
+
# out.path
|
79
|
+
# out.type
|
80
|
+
# xml = out.parse()
|
81
|
+
# puts xml
|
82
|
+
# xml.xpath('//xocs:cover-date-text', xml.root.namespaces).text
|
83
|
+
# # Get lots of articles
|
84
|
+
# links = links[1..3]
|
85
|
+
# out = links.collect{ |x| Textminer.fetch(url: x) }
|
86
|
+
# out.collect{ |z| z.path }
|
87
|
+
# out.collect{ |z| z.parse }
|
88
|
+
# zz = out[0].parse
|
89
|
+
# zz.xpath('//xocs:cover-date-text', zz.root.namespaces).text
|
90
|
+
#
|
91
|
+
# ## plain text
|
92
|
+
# # get full text links, here doing plain text
|
93
|
+
# links = res.links_plain(true);
|
94
|
+
# # Get full text for an article
|
95
|
+
# res = Textminer.fetch(url: links[0]);
|
96
|
+
# res.url
|
97
|
+
# res.parse
|
98
|
+
#
|
99
|
+
# # With open access content - using Pensoft
|
100
|
+
# res = Textminer.search(member: 2258, filter: {has_full_text: true});
|
101
|
+
# links = res.links_xml(true);
|
102
|
+
# # Get full text for an article
|
103
|
+
# res = Textminer.fetch(url: links[0]);
|
104
|
+
# res.url
|
105
|
+
# res.parse
|
106
|
+
#
|
107
|
+
# # OA content - pdfs, using pensoft again
|
108
|
+
# res = Textminer.search(member: 2258, filter: {has_full_text: true});
|
109
|
+
# links = res.links_pdf(true);
|
110
|
+
# # Get full text for an article
|
111
|
+
# res = Textminer.fetch(url: links[0]);
|
112
|
+
# # url used
|
113
|
+
# res.url
|
114
|
+
# # document type
|
115
|
+
# res.type
|
116
|
+
# # document path on your machine
|
117
|
+
# res.path
|
118
|
+
# # get text
|
119
|
+
# res.parse
|
120
|
+
def self.fetch(url)
|
121
|
+
Miner.new(url).perform
|
47
122
|
end
|
48
123
|
|
49
124
|
##
|
@@ -52,15 +127,34 @@ module Textminer
|
|
52
127
|
# @param path [String] Path to a pdf file downloaded via {fetch}, or
|
53
128
|
# another way.
|
54
129
|
#
|
130
|
+
# This method is used internally within fetch to parse PDFs.
|
131
|
+
#
|
55
132
|
# @example
|
56
|
-
#
|
57
|
-
#
|
58
|
-
#
|
59
|
-
#
|
60
|
-
#
|
133
|
+
# require 'textminer'
|
134
|
+
# res = Textminer.search(member: 2258, filter: {has_full_text: true});
|
135
|
+
# links = res.links_pdf(true);
|
136
|
+
# # Get full text for an article
|
137
|
+
# out = Textminer.fetch(url: links[0]);
|
138
|
+
# # extract pdf to text
|
139
|
+
# Textminer.extract(out.path)
|
61
140
|
def self.extract(path)
|
62
141
|
rr = PDF::Reader.new(path)
|
63
142
|
rr.pages.map { |page| page.text }.join("\n")
|
64
143
|
end
|
65
144
|
|
145
|
+
protected
|
146
|
+
|
147
|
+
def self.link_switch(x, y)
|
148
|
+
case y
|
149
|
+
when nil
|
150
|
+
x.links
|
151
|
+
when 'xml'
|
152
|
+
x.links_xml
|
153
|
+
when 'pdf'
|
154
|
+
x.links_pdf
|
155
|
+
when 'plain'
|
156
|
+
x.links_plain
|
157
|
+
end
|
158
|
+
end
|
159
|
+
|
66
160
|
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# taken from: https://viget.com/extend/easy-gem-configuration-variables-with-defaults
|
2
|
+
module Configuration
|
3
|
+
|
4
|
+
def configuration
|
5
|
+
yield self
|
6
|
+
end
|
7
|
+
|
8
|
+
def define_setting(name, default = nil)
|
9
|
+
class_variable_set("@@#{name}", default)
|
10
|
+
define_class_method "#{name}=" do |value|
|
11
|
+
class_variable_set("@@#{name}", value)
|
12
|
+
end
|
13
|
+
define_class_method name do
|
14
|
+
class_variable_get("@@#{name}")
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
private
|
19
|
+
|
20
|
+
def define_class_method(name, &block)
|
21
|
+
(class << self; self; end).instance_eval do
|
22
|
+
define_method name, &block
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
# Array methods
|
2
|
+
class Array
|
3
|
+
def links(just_urls = true)
|
4
|
+
return self.collect{ |x| x.links(just_urls) }.flatten
|
5
|
+
# if temp.length == 1
|
6
|
+
# return tmp[0]
|
7
|
+
# else
|
8
|
+
# return tmp
|
9
|
+
# end
|
10
|
+
# tmp = self.collect{ |x| x['message']['link'] }
|
11
|
+
# return parse_link(tmp, just_urls)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
class Array
|
16
|
+
def links_xml(just_urls = true)
|
17
|
+
self.collect { |z| z.links_xml(just_urls) }[0]
|
18
|
+
# return parse_link(self.collect { |z| z.links_xml }[0], just_urls)
|
19
|
+
# return parse_link(pull_link(self, '^application\/xml$|^text\/xml$'), just_urls)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
class Array
|
24
|
+
def links_pdf(just_urls = true)
|
25
|
+
self.collect { |z| z.links_pdf(just_urls) }[0]
|
26
|
+
# return parse_link(self.collect { |z| z.links_pdf }[0], just_urls)
|
27
|
+
# return parse_link(pull_link(self, '^application\/pdf$'), just_urls)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
class Array
|
32
|
+
def links_plain(just_urls = true)
|
33
|
+
self.collect { |z| z.links_plain(just_urls) }[0]
|
34
|
+
# return parse_link(self.collect { |z| z.links_plain }[0], just_urls)
|
35
|
+
# return parse_link(pull_link(self, '^application\/plain$|^text\/plain$'), just_urls)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
# def pull_link(x, y)
|
40
|
+
# return x.collect { |z| z.links_xml }[0]
|
41
|
+
# # return x.collect { |z| z['message']['link'] }.compact.collect { |z| z.compact.select { |w| w['content-type'].match(/#{y}/) } }
|
42
|
+
# end
|
43
|
+
|
44
|
+
# def parse_link(x, just_urls)
|
45
|
+
# if x.nil?
|
46
|
+
# return x
|
47
|
+
# else
|
48
|
+
# if just_urls
|
49
|
+
# return x.compact.collect { |z| z.collect{ |y| y['URL'] }}.flatten
|
50
|
+
# else
|
51
|
+
# return x
|
52
|
+
# end
|
53
|
+
# end
|
54
|
+
# end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
# Hash methods
|
2
|
+
class Hash
|
3
|
+
def links(just_urls = true)
|
4
|
+
if self['message']['items'].nil?
|
5
|
+
tmp = self['message']['link']
|
6
|
+
if tmp.nil?
|
7
|
+
tmp = nil
|
8
|
+
else
|
9
|
+
tmp = tmp.reject { |c| c.empty? }
|
10
|
+
end
|
11
|
+
else
|
12
|
+
tmp = self['message']['items'].collect { |x| x['link'] }.reject { |c| c.empty? }
|
13
|
+
end
|
14
|
+
|
15
|
+
return parse_links(tmp, just_urls)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
class Hash
|
20
|
+
def links_xml(just_urls = true)
|
21
|
+
return parse_links(pull_links(self, '^application\/xml$|^text\/xml$'), just_urls)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
class Hash
|
26
|
+
def links_pdf(just_urls = true)
|
27
|
+
return parse_links(pull_links(self, '^application\/pdf$'), just_urls)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
class Hash
|
32
|
+
def links_plain(just_urls = true)
|
33
|
+
return parse_links(pull_links(self, '^application\/plain$|^text\/plain$'), just_urls)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
def pull_links(x, y)
|
38
|
+
if x['message']['items'].nil?
|
39
|
+
tmp = self['message']['link']
|
40
|
+
if tmp.nil?
|
41
|
+
return nil
|
42
|
+
else
|
43
|
+
return tmp.select { |z| z['content-type'].match(/#{y}/) }.reject { |c| c.empty? }
|
44
|
+
end
|
45
|
+
else
|
46
|
+
return x['message']['items'].collect { |x| x['link'].select { |z| z['content-type'].match(/#{y}/) } }.reject { |c| c.empty? }
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
def parse_links(x, just_urls)
|
51
|
+
if x.nil?
|
52
|
+
return nil
|
53
|
+
else
|
54
|
+
if x.empty?
|
55
|
+
return x
|
56
|
+
else
|
57
|
+
if just_urls
|
58
|
+
if x[0].class != Array
|
59
|
+
# return x[0]['URL']
|
60
|
+
return x.collect { |x| x['URL'] }.flatten
|
61
|
+
else
|
62
|
+
return x.collect { |x| x.collect { |z| z['URL'] }}.flatten
|
63
|
+
# return x.collect { |x| x['URL'] }.flatten.compact
|
64
|
+
# return x.collect { |x| x.collect { |z| z['URL'] }}.flatten
|
65
|
+
end
|
66
|
+
else
|
67
|
+
return x
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'uuidtools'
|
3
|
+
|
4
|
+
def detect_type(x)
|
5
|
+
ctype = x.headers['content-type']
|
6
|
+
case ctype
|
7
|
+
when 'text/xml'
|
8
|
+
'xml'
|
9
|
+
when 'text/plain'
|
10
|
+
'plain'
|
11
|
+
when 'application/pdf'
|
12
|
+
'pdf'
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def make_ext(x)
|
17
|
+
case x
|
18
|
+
when 'xml'
|
19
|
+
'xml'
|
20
|
+
when 'plain'
|
21
|
+
'txt'
|
22
|
+
when 'pdf'
|
23
|
+
'pdf'
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def make_path(type)
|
28
|
+
# id = x.split('article/')[1].split('?')[0]
|
29
|
+
# path = id + '.' + type
|
30
|
+
# return path
|
31
|
+
type = make_ext(type)
|
32
|
+
uuid = UUIDTools::UUID.random_create.to_s
|
33
|
+
path = uuid + '.' + type
|
34
|
+
return path
|
35
|
+
end
|
36
|
+
|
37
|
+
def write_disk(res, path)
|
38
|
+
f = File.new(path, "wb")
|
39
|
+
f.write(res.body)
|
40
|
+
f.close()
|
41
|
+
end
|
42
|
+
|
43
|
+
def read_disk(path)
|
44
|
+
return File.read(path)
|
45
|
+
end
|
46
|
+
|
47
|
+
def parse_xml(x)
|
48
|
+
text = read_disk(x)
|
49
|
+
xml = Nokogiri.parse(text)
|
50
|
+
return xml
|
51
|
+
end
|
52
|
+
|
53
|
+
def parse_plain(x)
|
54
|
+
text = read_disk(x)
|
55
|
+
return text
|
56
|
+
end
|
57
|
+
|
58
|
+
def parse_pdf(x)
|
59
|
+
return Textminer.extract(x)
|
60
|
+
end
|
61
|
+
|
62
|
+
def is_elsevier_wiley(x)
|
63
|
+
tmp = x.match 'elsevier|wiley'
|
64
|
+
!tmp.nil?
|
65
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
|
3
|
+
##
|
4
|
+
# Textminer::Mined
|
5
|
+
#
|
6
|
+
# Class to give back text mining object
|
7
|
+
module Textminer
|
8
|
+
class Mined #:nodoc:
|
9
|
+
attr_accessor :url
|
10
|
+
attr_accessor :path
|
11
|
+
attr_accessor :type
|
12
|
+
|
13
|
+
def initialize(url, path, type)
|
14
|
+
self.url = url
|
15
|
+
self.path = path
|
16
|
+
self.type = type
|
17
|
+
end
|
18
|
+
|
19
|
+
def parse
|
20
|
+
case self.type
|
21
|
+
when 'xml'
|
22
|
+
parse_xml(self.path)
|
23
|
+
when 'plain'
|
24
|
+
parse_plain(self.path)
|
25
|
+
when 'pdf'
|
26
|
+
parse_pdf(self.path)
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
31
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require "faraday"
|
2
|
+
require "faraday_middleware"
|
3
|
+
require "multi_json"
|
4
|
+
require 'textminer/helpers/configuration'
|
5
|
+
require 'textminer/mined'
|
6
|
+
require 'textminer/mine_utils'
|
7
|
+
|
8
|
+
##
|
9
|
+
# Textminer::Miner
|
10
|
+
#
|
11
|
+
# Class to give back text mining object
|
12
|
+
module Textminer
|
13
|
+
class Miner #:nodoc:
|
14
|
+
attr_accessor :url
|
15
|
+
|
16
|
+
def initialize(url)
|
17
|
+
self.url = url
|
18
|
+
end
|
19
|
+
|
20
|
+
def perform
|
21
|
+
conn = Faraday.new self.url do |c|
|
22
|
+
c.use FaradayMiddleware::FollowRedirects
|
23
|
+
c.adapter :net_http
|
24
|
+
end
|
25
|
+
|
26
|
+
if is_elsevier_wiley(self.url)
|
27
|
+
res = conn.get do |req|
|
28
|
+
req.headers['CR-Clickthrough-Client-Token'] = Textminer.tdm_key
|
29
|
+
end
|
30
|
+
else
|
31
|
+
res = conn.get
|
32
|
+
end
|
33
|
+
|
34
|
+
type = detect_type(res)
|
35
|
+
path = make_path(type)
|
36
|
+
write_disk(res, path)
|
37
|
+
|
38
|
+
return Mined.new(self.url, path, type)
|
39
|
+
end
|
40
|
+
|
41
|
+
end
|
42
|
+
end
|
data/lib/textminer/request.rb
CHANGED
@@ -1,19 +1,36 @@
|
|
1
1
|
module Textminer
|
2
2
|
class Request #:nodoc:
|
3
3
|
attr_accessor :doi
|
4
|
+
attr_accessor :member
|
5
|
+
attr_accessor :filter
|
6
|
+
attr_accessor :limit
|
7
|
+
attr_accessor :options
|
4
8
|
|
5
|
-
def initialize(doi)
|
9
|
+
def initialize(doi, member, filter, limit, options)
|
6
10
|
self.doi = doi
|
11
|
+
self.member = member
|
12
|
+
self.filter = filter
|
13
|
+
self.limit = limit
|
14
|
+
self.options = options
|
7
15
|
end
|
8
16
|
|
9
17
|
def perform
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
18
|
+
fac = nil
|
19
|
+
|
20
|
+
if member.nil?
|
21
|
+
res = Serrano.works(ids: doi, filter: filter, limit: limit, options: options)
|
22
|
+
if doi.nil?
|
23
|
+
fac = Serrano.works(ids: doi, filter: filter, options: options, facet: 'license:*', limit: 0)
|
24
|
+
fac = fac['message']['facets']['license']['value-count'].to_s
|
25
|
+
end
|
26
|
+
else
|
27
|
+
res = Serrano.members(ids: member, filter: filter, works: true, limit: limit, options: options)
|
28
|
+
if member.nil?
|
29
|
+
fac = Serrano.member(ids: member, filter: filter, options: options, facet: 'license:*', limit: 0)
|
30
|
+
fac = fac['message']['facets']['license']['value-count'].to_s
|
31
|
+
end
|
14
32
|
end
|
15
|
-
|
16
|
-
Response.new(self.doi, coll)
|
33
|
+
Response.new(self.doi, self.member, res, fac)
|
17
34
|
end
|
18
35
|
end
|
19
36
|
end
|
data/lib/textminer/response.rb
CHANGED
@@ -1,52 +1,76 @@
|
|
1
|
+
require 'launchy'
|
2
|
+
require "textminer/link_methods_hash"
|
3
|
+
require "textminer/link_methods_array"
|
4
|
+
|
1
5
|
module Textminer
|
2
6
|
class Response #:nodoc:
|
3
|
-
attr_reader :doi, :response
|
7
|
+
attr_reader :doi, :member, :response, :facet
|
4
8
|
|
5
|
-
def initialize(doi,
|
9
|
+
def initialize(doi, member, response, facet)
|
6
10
|
@doi = doi
|
7
|
-
@
|
11
|
+
@member = member
|
12
|
+
@response = response
|
13
|
+
@facet = facet
|
8
14
|
end
|
9
15
|
|
10
|
-
def
|
11
|
-
|
12
|
-
|
16
|
+
def to_s
|
17
|
+
if !@doi.nil?
|
18
|
+
if @doi.length > 3
|
19
|
+
ending = '...'
|
20
|
+
else
|
21
|
+
ending = ''
|
22
|
+
end
|
23
|
+
tt = sprintf('dois: %s %s', Array(@doi)[0..2].join(', '), ending)
|
24
|
+
end
|
25
|
+
if !@member.nil?
|
26
|
+
tt = 'member: ' + @member.to_s
|
27
|
+
end
|
28
|
+
if @doi.nil? && @member.nil?
|
29
|
+
tt = ''
|
30
|
+
end
|
31
|
+
sprintf("<textminer>: \n search: %s\n no. licenses: %s", tt, @facet)
|
13
32
|
end
|
14
33
|
|
15
|
-
def
|
16
|
-
|
17
|
-
@res.collect { |x| JSON.parse(x.body) }
|
34
|
+
def inspect
|
35
|
+
to_s
|
18
36
|
end
|
19
37
|
|
20
|
-
def
|
21
|
-
|
22
|
-
@res.collect { |x| x['message']['link'] }
|
38
|
+
def body
|
39
|
+
@response
|
23
40
|
end
|
24
41
|
|
25
|
-
def
|
26
|
-
tmp = links
|
27
|
-
|
28
|
-
tmp.collect { |z|
|
29
|
-
z.select{ |x| x['content-type'] == "application/pdf" }[0]['URL']
|
30
|
-
}
|
31
|
-
end
|
42
|
+
def links(just_urls = true)
|
43
|
+
tmp = @response.links(just_urls)
|
44
|
+
compactif(tmp)
|
32
45
|
end
|
33
46
|
|
34
|
-
def
|
35
|
-
tmp =
|
36
|
-
|
37
|
-
tmp.collect { |z|
|
38
|
-
z.select{ |x| x['content-type'] == "application/xml" }[0]['URL']
|
39
|
-
}
|
40
|
-
end
|
47
|
+
def links_xml(just_urls = true)
|
48
|
+
tmp = @response.links_xml(just_urls)
|
49
|
+
compactif(tmp)
|
41
50
|
end
|
42
51
|
|
43
|
-
def
|
44
|
-
|
52
|
+
def links_pdf(just_urls = true)
|
53
|
+
tmp = @response.links_pdf(just_urls)
|
54
|
+
compactif(tmp)
|
45
55
|
end
|
46
56
|
|
47
|
-
|
57
|
+
def links_plain(just_urls = true)
|
58
|
+
tmp = @response.links_plain(just_urls)
|
59
|
+
compactif(tmp)
|
60
|
+
end
|
48
61
|
|
49
|
-
|
62
|
+
protected
|
50
63
|
|
64
|
+
def compactif(z)
|
65
|
+
if z.nil?
|
66
|
+
return z
|
67
|
+
else
|
68
|
+
return z.compact
|
69
|
+
end
|
70
|
+
end
|
71
|
+
# def browse
|
72
|
+
# url = 'http://doi.org/' + @doi
|
73
|
+
# Launchy.open(url)
|
74
|
+
# end
|
51
75
|
end
|
52
76
|
end
|
data/lib/textminer/version.rb
CHANGED
data/textminer.gemspec
CHANGED
@@ -6,7 +6,7 @@ require 'textminer/version'
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = 'textminer'
|
8
8
|
s.version = Textminer::VERSION
|
9
|
-
s.date = '2015-
|
9
|
+
s.date = '2015-12-04'
|
10
10
|
s.summary = "Interact with Crossref's Text and Data mining API"
|
11
11
|
s.description = "Search Crossref's search API for full text content, and get full text content."
|
12
12
|
s.authors = "Scott Chamberlain"
|
@@ -15,7 +15,6 @@ Gem::Specification.new do |s|
|
|
15
15
|
s.licenses = 'MIT'
|
16
16
|
|
17
17
|
s.files = `git ls-files -z`.split("\x0").reject {|f| f.match(%r{^(test|spec|features)/}) }
|
18
|
-
s.test_files = ["test/test_tdm.rb"]
|
19
18
|
s.require_paths = ["lib"]
|
20
19
|
|
21
20
|
s.bindir = 'bin'
|
@@ -27,9 +26,16 @@ Gem::Specification.new do |s|
|
|
27
26
|
s.add_development_dependency "oga", '~> 1.2'
|
28
27
|
s.add_development_dependency "simplecov", '~> 0.10'
|
29
28
|
s.add_development_dependency "codecov", '~> 0.1'
|
29
|
+
|
30
|
+
s.add_runtime_dependency 'serrano', '~> 0.1.4.1'
|
30
31
|
s.add_runtime_dependency 'httparty', '~> 0.13'
|
31
32
|
s.add_runtime_dependency 'thor', '~> 0.19'
|
32
33
|
s.add_runtime_dependency 'json', '~> 1.8'
|
33
|
-
s.add_runtime_dependency '
|
34
|
+
s.add_runtime_dependency 'multi_json', '~> 1.0'
|
35
|
+
s.add_runtime_dependency 'faraday', '~> 0.9.1'
|
36
|
+
s.add_runtime_dependency 'faraday_middleware', '~> 0.10.0'
|
37
|
+
s.add_runtime_dependency 'launchy', '~> 2.4', '>= 2.4.3'
|
34
38
|
s.add_runtime_dependency 'pdf-reader','~> 1.3'
|
39
|
+
s.add_runtime_dependency 'nokogiri', '~> 1.6', '>= 1.6.6.2'
|
40
|
+
s.add_runtime_dependency 'uuidtools', '~> 2.1', '>= 2.1.5'
|
35
41
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textminer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Scott Chamberlain
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-12-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0.1'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: serrano
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: 0.1.4.1
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: 0.1.4.1
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: httparty
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -136,6 +150,48 @@ dependencies:
|
|
136
150
|
- - "~>"
|
137
151
|
- !ruby/object:Gem::Version
|
138
152
|
version: '1.8'
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
name: multi_json
|
155
|
+
requirement: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - "~>"
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '1.0'
|
160
|
+
type: :runtime
|
161
|
+
prerelease: false
|
162
|
+
version_requirements: !ruby/object:Gem::Requirement
|
163
|
+
requirements:
|
164
|
+
- - "~>"
|
165
|
+
- !ruby/object:Gem::Version
|
166
|
+
version: '1.0'
|
167
|
+
- !ruby/object:Gem::Dependency
|
168
|
+
name: faraday
|
169
|
+
requirement: !ruby/object:Gem::Requirement
|
170
|
+
requirements:
|
171
|
+
- - "~>"
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: 0.9.1
|
174
|
+
type: :runtime
|
175
|
+
prerelease: false
|
176
|
+
version_requirements: !ruby/object:Gem::Requirement
|
177
|
+
requirements:
|
178
|
+
- - "~>"
|
179
|
+
- !ruby/object:Gem::Version
|
180
|
+
version: 0.9.1
|
181
|
+
- !ruby/object:Gem::Dependency
|
182
|
+
name: faraday_middleware
|
183
|
+
requirement: !ruby/object:Gem::Requirement
|
184
|
+
requirements:
|
185
|
+
- - "~>"
|
186
|
+
- !ruby/object:Gem::Version
|
187
|
+
version: 0.10.0
|
188
|
+
type: :runtime
|
189
|
+
prerelease: false
|
190
|
+
version_requirements: !ruby/object:Gem::Requirement
|
191
|
+
requirements:
|
192
|
+
- - "~>"
|
193
|
+
- !ruby/object:Gem::Version
|
194
|
+
version: 0.10.0
|
139
195
|
- !ruby/object:Gem::Dependency
|
140
196
|
name: launchy
|
141
197
|
requirement: !ruby/object:Gem::Requirement
|
@@ -145,7 +201,7 @@ dependencies:
|
|
145
201
|
version: '2.4'
|
146
202
|
- - ">="
|
147
203
|
- !ruby/object:Gem::Version
|
148
|
-
version: 2.4.
|
204
|
+
version: 2.4.3
|
149
205
|
type: :runtime
|
150
206
|
prerelease: false
|
151
207
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -155,7 +211,7 @@ dependencies:
|
|
155
211
|
version: '2.4'
|
156
212
|
- - ">="
|
157
213
|
- !ruby/object:Gem::Version
|
158
|
-
version: 2.4.
|
214
|
+
version: 2.4.3
|
159
215
|
- !ruby/object:Gem::Dependency
|
160
216
|
name: pdf-reader
|
161
217
|
requirement: !ruby/object:Gem::Requirement
|
@@ -170,6 +226,46 @@ dependencies:
|
|
170
226
|
- - "~>"
|
171
227
|
- !ruby/object:Gem::Version
|
172
228
|
version: '1.3'
|
229
|
+
- !ruby/object:Gem::Dependency
|
230
|
+
name: nokogiri
|
231
|
+
requirement: !ruby/object:Gem::Requirement
|
232
|
+
requirements:
|
233
|
+
- - "~>"
|
234
|
+
- !ruby/object:Gem::Version
|
235
|
+
version: '1.6'
|
236
|
+
- - ">="
|
237
|
+
- !ruby/object:Gem::Version
|
238
|
+
version: 1.6.6.2
|
239
|
+
type: :runtime
|
240
|
+
prerelease: false
|
241
|
+
version_requirements: !ruby/object:Gem::Requirement
|
242
|
+
requirements:
|
243
|
+
- - "~>"
|
244
|
+
- !ruby/object:Gem::Version
|
245
|
+
version: '1.6'
|
246
|
+
- - ">="
|
247
|
+
- !ruby/object:Gem::Version
|
248
|
+
version: 1.6.6.2
|
249
|
+
- !ruby/object:Gem::Dependency
|
250
|
+
name: uuidtools
|
251
|
+
requirement: !ruby/object:Gem::Requirement
|
252
|
+
requirements:
|
253
|
+
- - "~>"
|
254
|
+
- !ruby/object:Gem::Version
|
255
|
+
version: '2.1'
|
256
|
+
- - ">="
|
257
|
+
- !ruby/object:Gem::Version
|
258
|
+
version: 2.1.5
|
259
|
+
type: :runtime
|
260
|
+
prerelease: false
|
261
|
+
version_requirements: !ruby/object:Gem::Requirement
|
262
|
+
requirements:
|
263
|
+
- - "~>"
|
264
|
+
- !ruby/object:Gem::Version
|
265
|
+
version: '2.1'
|
266
|
+
- - ">="
|
267
|
+
- !ruby/object:Gem::Version
|
268
|
+
version: 2.1.5
|
173
269
|
description: Search Crossref's search API for full text content, and get full text
|
174
270
|
content.
|
175
271
|
email: myrmecocystus@gmail.com
|
@@ -180,18 +276,25 @@ extra_rdoc_files: []
|
|
180
276
|
files:
|
181
277
|
- ".gitignore"
|
182
278
|
- ".travis.yml"
|
279
|
+
- CHANGELOG.md
|
183
280
|
- Gemfile
|
184
281
|
- Gemfile.lock
|
185
|
-
- NEWS.md
|
186
282
|
- README.md
|
187
283
|
- Rakefile
|
188
284
|
- bin/tm
|
285
|
+
- extra/fetch.rb
|
286
|
+
- extra/fetch_method.rb
|
189
287
|
- lib/textminer.rb
|
190
|
-
- lib/textminer/
|
288
|
+
- lib/textminer/helpers/configuration.rb
|
289
|
+
- lib/textminer/link_methods_array.rb
|
290
|
+
- lib/textminer/link_methods_hash.rb
|
291
|
+
- lib/textminer/mine_utils.rb
|
292
|
+
- lib/textminer/mined.rb
|
293
|
+
- lib/textminer/miner.rb
|
191
294
|
- lib/textminer/request.rb
|
192
295
|
- lib/textminer/response.rb
|
296
|
+
- lib/textminer/tmutils.rb
|
193
297
|
- lib/textminer/version.rb
|
194
|
-
- test/test_tdm.rb
|
195
298
|
- textminer.gemspec
|
196
299
|
homepage: http://github.com/sckott/textminer
|
197
300
|
licenses:
|
@@ -213,10 +316,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
213
316
|
version: '0'
|
214
317
|
requirements: []
|
215
318
|
rubyforge_project:
|
216
|
-
rubygems_version: 2.4.5
|
319
|
+
rubygems_version: 2.4.5.1
|
217
320
|
signing_key:
|
218
321
|
specification_version: 4
|
219
322
|
summary: Interact with Crossref's Text and Data mining API
|
220
|
-
test_files:
|
221
|
-
- test/test_tdm.rb
|
323
|
+
test_files: []
|
222
324
|
has_rdoc:
|
data/NEWS.md
DELETED
data/test/test_tdm.rb
DELETED
@@ -1,52 +0,0 @@
|
|
1
|
-
require 'simplecov'
|
2
|
-
SimpleCov.start
|
3
|
-
if ENV['CI']=='true'
|
4
|
-
require 'codecov'
|
5
|
-
SimpleCov.formatter = SimpleCov::Formatter::Codecov
|
6
|
-
end
|
7
|
-
|
8
|
-
require "textminer"
|
9
|
-
require 'fileutils'
|
10
|
-
require "test/unit"
|
11
|
-
require "oga"
|
12
|
-
|
13
|
-
class TestResponse < Test::Unit::TestCase
|
14
|
-
|
15
|
-
def setup
|
16
|
-
@doi = '10.5555/515151'
|
17
|
-
@doi2 = "10.3897/phytokeys.42.7604"
|
18
|
-
@pdf = ["http://annalsofpsychoceramics.labs.crossref.org/fulltext/10.5555/515151.pdf"]
|
19
|
-
@xml = ["http://annalsofpsychoceramics.labs.crossref.org/fulltext/10.5555/515151.xml"]
|
20
|
-
end
|
21
|
-
|
22
|
-
def test_links_endpoint
|
23
|
-
assert_equal(Textminer::Response, Textminer.links(@doi).class)
|
24
|
-
end
|
25
|
-
|
26
|
-
def test_doi
|
27
|
-
assert_equal(@doi, Textminer.links(@doi).doi)
|
28
|
-
end
|
29
|
-
|
30
|
-
def test_pdf
|
31
|
-
assert_equal(@pdf, Textminer.links(@doi).pdf)
|
32
|
-
end
|
33
|
-
|
34
|
-
def test_xml
|
35
|
-
assert_equal(@xml, Textminer.links(@doi).xml)
|
36
|
-
end
|
37
|
-
|
38
|
-
def test_fetch_xml
|
39
|
-
res = Textminer.fetch(@doi2, "xml")
|
40
|
-
assert_equal(HTTParty::Response, res[0].class)
|
41
|
-
assert_true(res[0].ok?)
|
42
|
-
assert_equal(String, res[0].body.class)
|
43
|
-
assert_equal("PhytoKeys", Oga.parse_xml(res[0].body).xpath('//journal-meta//journal-id').text)
|
44
|
-
end
|
45
|
-
|
46
|
-
# def test_fetch_pdf
|
47
|
-
# res = Textminer.fetch(@doi2, "pdf")
|
48
|
-
# assert_equal(HTTParty::Response, res.class)
|
49
|
-
# assert_true(res.ok?)
|
50
|
-
# end
|
51
|
-
|
52
|
-
end
|