textminer 0.1.0 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +0 -1
- data/CHANGELOG.md +9 -0
- data/Gemfile.lock +27 -3
- data/README.md +61 -58
- data/Rakefile +17 -2
- data/bin/tm +1 -1
- data/{lib/textminer → extra}/fetch.rb +0 -0
- data/extra/fetch_method.rb +17 -0
- data/lib/textminer.rb +120 -26
- data/lib/textminer/helpers/configuration.rb +26 -0
- data/lib/textminer/link_methods_array.rb +54 -0
- data/lib/textminer/link_methods_hash.rb +71 -0
- data/lib/textminer/mine_utils.rb +65 -0
- data/lib/textminer/mined.rb +31 -0
- data/lib/textminer/miner.rb +42 -0
- data/lib/textminer/request.rb +24 -7
- data/lib/textminer/response.rb +54 -30
- data/lib/textminer/tmutils.rb +7 -0
- data/lib/textminer/version.rb +1 -1
- data/textminer.gemspec +9 -3
- metadata +112 -10
- data/NEWS.md +0 -3
- data/test/test_tdm.rb +0 -52
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c6c80a22022bb38bc141dc50e8da5d913db03946
|
4
|
+
data.tar.gz: 957cf24214f95f1b2d8309f2fd1a2e2aa7b6ca69
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9837bd893866ef35e420d928bf02f3151783b345d39f758ed5ddce8b98c6df92147ff518b889d5cab33f84aa62ad795a3e7e1e2c6ad18cfd7a9a3060589293eb
|
7
|
+
data.tar.gz: 1151759369e8007f85ad73f24872f409ffcb70e99ad114e7a48e623c48a53ea118c7ed13a4ded171fb64823a2e21e342000f77766aaac93b12493335ace58f1d
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
## 0.1.5 (2015-12-04)
|
2
|
+
|
3
|
+
* Now using `serrano` gem for interacting with the Crossref API
|
4
|
+
* Changed `links` method to `search`
|
5
|
+
* Changed `fetch` method to accept a URL for a full text article instead of a DOI
|
6
|
+
|
7
|
+
## 0.1.0 (2015-08-24)
|
8
|
+
|
9
|
+
* First version to Rubygems
|
data/Gemfile.lock
CHANGED
@@ -1,12 +1,18 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
textminer (0.1.
|
4
|
+
textminer (0.1.5)
|
5
|
+
faraday (~> 0.9.1)
|
6
|
+
faraday_middleware (~> 0.10.0)
|
5
7
|
httparty (~> 0.13)
|
6
8
|
json (~> 1.8)
|
7
|
-
launchy (~> 2.4, >= 2.4.
|
9
|
+
launchy (~> 2.4, >= 2.4.3)
|
10
|
+
multi_json (~> 1.0)
|
11
|
+
nokogiri (~> 1.6, >= 1.6.6.2)
|
8
12
|
pdf-reader (~> 1.3)
|
13
|
+
serrano (~> 0.1.4.1)
|
9
14
|
thor (~> 0.19)
|
15
|
+
uuidtools (~> 2.1, >= 2.1.5)
|
10
16
|
|
11
17
|
GEM
|
12
18
|
remote: https://rubygems.org/
|
@@ -21,14 +27,23 @@ GEM
|
|
21
27
|
simplecov
|
22
28
|
url
|
23
29
|
docile (1.1.5)
|
30
|
+
faraday (0.9.1)
|
31
|
+
multipart-post (>= 1.2, < 3)
|
32
|
+
faraday_middleware (0.10.0)
|
33
|
+
faraday (>= 0.7.4, < 0.10)
|
24
34
|
hashery (2.1.1)
|
25
|
-
httparty (0.13.
|
35
|
+
httparty (0.13.7)
|
26
36
|
json (~> 1.8)
|
27
37
|
multi_xml (>= 0.5.2)
|
28
38
|
json (1.8.3)
|
29
39
|
launchy (2.4.3)
|
30
40
|
addressable (~> 2.3)
|
41
|
+
mini_portile (0.6.2)
|
42
|
+
multi_json (1.11.2)
|
31
43
|
multi_xml (0.5.5)
|
44
|
+
multipart-post (2.0.0)
|
45
|
+
nokogiri (1.6.6.2)
|
46
|
+
mini_portile (~> 0.6.0)
|
32
47
|
oga (1.2.3)
|
33
48
|
ast
|
34
49
|
ruby-ll (~> 2.1)
|
@@ -44,6 +59,11 @@ GEM
|
|
44
59
|
ansi
|
45
60
|
ast
|
46
61
|
ruby-rc4 (0.1.5)
|
62
|
+
serrano (0.1.4.1)
|
63
|
+
faraday (~> 0.9.1)
|
64
|
+
faraday_middleware (~> 0.10.0)
|
65
|
+
multi_json (~> 1.0)
|
66
|
+
thor (~> 0.19)
|
47
67
|
simplecov (0.10.0)
|
48
68
|
docile (~> 1.1.0)
|
49
69
|
json (~> 1.8)
|
@@ -54,6 +74,7 @@ GEM
|
|
54
74
|
thor (0.19.1)
|
55
75
|
ttfunk (1.4.0)
|
56
76
|
url (0.3.2)
|
77
|
+
uuidtools (2.1.5)
|
57
78
|
|
58
79
|
PLATFORMS
|
59
80
|
ruby
|
@@ -66,3 +87,6 @@ DEPENDENCIES
|
|
66
87
|
simplecov (~> 0.10)
|
67
88
|
test-unit (~> 3.1)
|
68
89
|
textminer!
|
90
|
+
|
91
|
+
BUNDLED WITH
|
92
|
+
1.10.6
|
data/README.md
CHANGED
@@ -1,24 +1,29 @@
|
|
1
1
|
textminer
|
2
2
|
=========
|
3
3
|
|
4
|
-
[](https://rubygems.org/gems/textminer)
|
5
|
+
[](https://travis-ci.org/sckott/textminer)
|
5
6
|
[](http://codecov.io/github/sckott/textminer?branch=master)
|
6
7
|
|
7
|
-
__This is alpha software, so expect changes__
|
8
|
-
|
9
|
-
## What is it?
|
10
|
-
|
11
8
|
__`textminer` helps you text mine through Crossref's TDM (Text & Data Mining) services:__
|
12
9
|
|
13
10
|
## Changes
|
14
11
|
|
15
|
-
For changes see the [
|
12
|
+
For changes see the [CHANGELOG][changelog]
|
13
|
+
|
14
|
+
## gem API
|
15
|
+
|
16
|
+
* `Textminer.search` - search by DOI, query string, filters, etc. to get Crossref metadata, which you can use downstream to get full text links. This method essentially wraps `Serrano.works()`, but only a subset of params - this interface may change depending on feedback.
|
17
|
+
* `Textminer.fetch` - Fetch full text given a url, supports Crossref's Text and Data Mining service
|
18
|
+
* `Textminer.extract` - Extract text from a pdf
|
16
19
|
|
17
20
|
## Install
|
18
21
|
|
19
22
|
### Release version
|
20
23
|
|
21
|
-
|
24
|
+
```
|
25
|
+
gem install textminer
|
26
|
+
```
|
22
27
|
|
23
28
|
### Development version
|
24
29
|
|
@@ -28,89 +33,87 @@ cd textminer
|
|
28
33
|
rake install
|
29
34
|
```
|
30
35
|
|
31
|
-
##
|
36
|
+
## Examples
|
37
|
+
|
38
|
+
### Within Ruby
|
39
|
+
|
40
|
+
#### Search
|
32
41
|
|
33
42
|
Search by DOI
|
34
43
|
|
35
44
|
```ruby
|
36
45
|
require 'textminer'
|
37
|
-
|
46
|
+
# link to full text available
|
47
|
+
Textminer.search(doi: '10.7554/elife.06430')
|
48
|
+
# no link to full text available
|
49
|
+
Textminer.search(doi: "10.1371/journal.pone.0000308")
|
38
50
|
```
|
39
51
|
|
40
|
-
|
41
|
-
|
42
|
-
```ruby
|
43
|
-
out.pdf
|
44
|
-
```
|
52
|
+
Many DOIs at once
|
45
53
|
|
46
54
|
```ruby
|
47
|
-
|
55
|
+
require 'serrano'
|
56
|
+
dois = Serrano.random_dois(sample: 6)
|
57
|
+
Textminer.search(doi: dois)
|
48
58
|
```
|
49
59
|
|
50
|
-
|
60
|
+
Search with filters
|
51
61
|
|
52
62
|
```ruby
|
53
|
-
|
63
|
+
Textminer.search(filter: {has_full_text: true})
|
54
64
|
```
|
55
65
|
|
56
|
-
|
57
|
-
"http://annalsofpsychoceramics.labs.crossref.org/fulltext/10.5555/515151.xml"
|
58
|
-
```
|
66
|
+
#### Get full text links
|
59
67
|
|
60
|
-
|
68
|
+
The object returned from `Textminer.search` is a class, which has methods for pulling out all links, xml only, pdf only, or plain text only
|
61
69
|
|
62
70
|
```ruby
|
63
|
-
Textminer.
|
71
|
+
x = Textminer.search(filter: {has_full_text: true})
|
72
|
+
x.links_xml
|
73
|
+
x.links_pdf
|
74
|
+
x.links_plain
|
64
75
|
```
|
65
76
|
|
66
|
-
|
67
|
-
=> {"article"=>
|
68
|
-
{"front"=>
|
69
|
-
{"journal_meta"=>
|
70
|
-
{"journal_id"=>
|
71
|
-
{"__content__"=>"PhytoKeys", "journal_id_type"=>"publisher-id"},
|
72
|
-
"journal_title_group"=>
|
73
|
-
{"journal_title"=>{"__content__"=>"PhytoKeys", "lang"=>"en"},
|
74
|
-
"abbrev_journal_title"=>{"__content__"=>"PhytoKeys", "lang"=>"en"}},
|
75
|
-
"issn"=>
|
76
|
-
[{"__content__"=>"1314-2011", "pub_type"=>"ppub"},
|
77
|
-
{"__content__"=>"1314-2003", "pub_type"=>"epub"}],
|
78
|
-
"publisher"=>{"publisher_name"=>"Pensoft Publishers"}},
|
79
|
-
"article_meta"=>
|
80
|
-
|
81
|
-
...
|
82
|
-
```
|
77
|
+
#### Fetch full text
|
83
78
|
|
84
|
-
|
79
|
+
`Textminer.fetch()` gets full text based on URL input. We determine how to pull down and parse the content based on content type.
|
85
80
|
|
86
81
|
```ruby
|
87
|
-
|
82
|
+
# get some metadata
|
83
|
+
res = Textminer.search(member: 2258, filter: {has_full_text: true});
|
84
|
+
# get links
|
85
|
+
links = res.links_xml(true);
|
86
|
+
# Get full text for an article
|
87
|
+
res = Textminer.fetch(url: links[0]);
|
88
|
+
# url
|
89
|
+
res.url
|
90
|
+
# file path
|
91
|
+
res.path
|
92
|
+
# content type
|
93
|
+
res.type
|
94
|
+
# parse content
|
95
|
+
res.parse
|
88
96
|
```
|
89
97
|
|
90
|
-
|
91
|
-
|
92
|
-
## On the CLI
|
98
|
+
#### Extract text from PDF
|
93
99
|
|
94
|
-
|
100
|
+
`Textminer.extract()` extracts text from a pdf, given a path for a pdf
|
95
101
|
|
96
|
-
```
|
97
|
-
|
102
|
+
```ruby
|
103
|
+
res = Textminer.search(member: 2258, filter: {has_full_text: true});
|
104
|
+
links = res.links_pdf(true);
|
105
|
+
res = Textminer.fetch(url: links[0]);
|
106
|
+
Textminer.extract(res.path)
|
98
107
|
```
|
99
108
|
|
100
|
-
|
101
|
-
http://phytokeys.pensoft.net/lib/ajax_srv/article_elements_srv.php?action=download_xml&item_id=4190
|
102
|
-
http://phytokeys.pensoft.net/lib/ajax_srv/article_elements_srv.php?action=download_pdf&item_id=4190
|
103
|
-
```
|
109
|
+
### On the CLI
|
104
110
|
|
105
|
-
|
106
|
-
|
107
|
-
```sh
|
108
|
-
tm links '10.3897/phytokeys.42.7604,10.3897/zookeys.516.9439'
|
109
|
-
```
|
111
|
+
Coming soon...
|
110
112
|
|
111
113
|
## To do
|
112
114
|
|
113
115
|
* CLI executable
|
114
|
-
* get actual full text
|
115
116
|
* better test suite
|
116
|
-
* documentation
|
117
|
+
* better documentation
|
118
|
+
|
119
|
+
[changelog]: https://github.com/sckott/textminer/blob/master/CHANGELOG.md
|
data/Rakefile
CHANGED
@@ -3,20 +3,35 @@ require 'rake/testtask'
|
|
3
3
|
|
4
4
|
Rake::TestTask.new do |t|
|
5
5
|
t.libs << "test"
|
6
|
-
t.test_files = FileList['test/test
|
6
|
+
t.test_files = FileList['test/test-*.rb']
|
7
7
|
t.verbose = true
|
8
8
|
end
|
9
9
|
|
10
10
|
desc "Run tests"
|
11
11
|
task :default => :test
|
12
12
|
|
13
|
+
desc "Build textminer docs"
|
14
|
+
task :docs do
|
15
|
+
system "yardoc"
|
16
|
+
end
|
17
|
+
|
18
|
+
desc "bundle install"
|
19
|
+
task :bundle do
|
20
|
+
system "bundle install"
|
21
|
+
end
|
22
|
+
|
23
|
+
desc "clean out builds"
|
24
|
+
task :clean do
|
25
|
+
system "ls | grep [0-9].gem | xargs rm"
|
26
|
+
end
|
27
|
+
|
13
28
|
desc "Build textminer"
|
14
29
|
task :build do
|
15
30
|
system "gem build textminer.gemspec"
|
16
31
|
end
|
17
32
|
|
18
33
|
desc "Install textminer"
|
19
|
-
task :install => :build do
|
34
|
+
task :install => [:bundle, :build] do
|
20
35
|
system "gem install textminer-#{Textminer::VERSION}.gem"
|
21
36
|
end
|
22
37
|
|
data/bin/tm
CHANGED
File without changes
|
@@ -0,0 +1,17 @@
|
|
1
|
+
##
|
2
|
+
# Fetch full text for one or more DOIs, as xml (default) or pdf
|
3
|
+
#
|
4
|
+
# @param doi [Array] A DOI, digital object identifier
|
5
|
+
# @param type [Array] One of two options to download: xml (default) or pdf
|
6
|
+
#
|
7
|
+
# @example
|
8
|
+
# require 'textminer'
|
9
|
+
# # fetch full text by DOI - xml by default
|
10
|
+
# Textminer.fetch("10.3897/phytokeys.42.7604")
|
11
|
+
# # many DOIs - xml output
|
12
|
+
# res = Textminer.fetch(["10.3897/phytokeys.42.7604", "10.3897/zookeys.516.9439"])
|
13
|
+
# # fetch full text - pdf
|
14
|
+
# Textminer.fetch("10.3897/phytokeys.42.7604", "pdf")
|
15
|
+
def self.fetch(doi, type = 'xml')
|
16
|
+
Fetch.new(doi, type).fetchtext
|
17
|
+
end
|
data/lib/textminer.rb
CHANGED
@@ -1,49 +1,124 @@
|
|
1
1
|
require 'httparty'
|
2
2
|
require 'json'
|
3
3
|
require 'pdf-reader'
|
4
|
+
require 'serrano'
|
5
|
+
require "textminer/miner"
|
4
6
|
require "textminer/version"
|
5
7
|
require "textminer/request"
|
6
8
|
require "textminer/response"
|
7
|
-
require "textminer/fetch"
|
8
9
|
|
9
10
|
module Textminer
|
11
|
+
extend Configuration
|
12
|
+
|
13
|
+
define_setting :tdm_key
|
14
|
+
|
10
15
|
##
|
11
|
-
#
|
16
|
+
# Search for papers and get full text links
|
12
17
|
#
|
13
18
|
# @param doi [Array] A DOI, digital object identifier
|
19
|
+
# @param options [Array] Curl request options
|
14
20
|
# @return [Array] the output
|
15
21
|
#
|
16
22
|
# @example
|
17
23
|
# require 'textminer'
|
18
24
|
# # link to full text available
|
19
|
-
# Textminer.
|
25
|
+
# Textminer.search(doi: '10.3897/phytokeys.42.7604')
|
20
26
|
# # no link to full text available
|
21
|
-
# Textminer.
|
27
|
+
# Textminer.search(doi: "10.1371/journal.pone.0000308")
|
22
28
|
# # many DOIs at once
|
23
|
-
#
|
29
|
+
# require 'serrano'
|
30
|
+
# dois = Serrano.random_dois(sample: 6)
|
31
|
+
# res = Textminer.search(doi: dois)
|
32
|
+
# res = Textminer.search(doi: ["10.3897/phytokeys.42.7604", "10.3897/zookeys.516.9439"])
|
24
33
|
# res.links
|
25
|
-
# res.
|
26
|
-
# res.
|
27
|
-
|
28
|
-
|
34
|
+
# res.links_pdf
|
35
|
+
# res.links_xml
|
36
|
+
# res.links_plain
|
37
|
+
# # only full text available
|
38
|
+
# x = Textminer.search(doi: '10.3816/clm.2001.n.006')
|
39
|
+
# x.links_xml
|
40
|
+
# x.links_plain
|
41
|
+
# x.links_pdf
|
42
|
+
# # no dois
|
43
|
+
# x = Textminer.search(filter: {has_full_text: true})
|
44
|
+
# x.links_xml
|
45
|
+
# x.links_plain
|
46
|
+
# x = Textminer.search(member: 311, filter: {has_full_text: true})
|
47
|
+
# x.links_pdf
|
48
|
+
def self.search(doi: nil, member: nil, filter: nil, limit: nil, options: nil)
|
49
|
+
Request.new(doi, member, filter, limit, options).perform
|
29
50
|
end
|
30
51
|
|
31
52
|
##
|
32
|
-
#
|
53
|
+
# Get full text
|
33
54
|
#
|
34
|
-
#
|
35
|
-
#
|
55
|
+
# Works easily for open access papers, but not for closed ones. For non-OA papers, use
|
56
|
+
# Crossref's Text and Data Mining service, which requires authentication and
|
57
|
+
# pre-authorized IP address. Go to https://apps.crossref.org/clickthrough/researchers
|
58
|
+
# to sign up for the TDM service, to get your key. The only publishers
|
59
|
+
# taking part at this time are Elsevier and Wiley.
|
60
|
+
#
|
61
|
+
# @param url [String] A url for full text
|
62
|
+
# @return [Mined] An object of class Mined, with methods for extracting
|
63
|
+
# the url requested, the file path, and parsing the plain text, XML, or extracting
|
64
|
+
# text from the pdf.
|
36
65
|
#
|
37
66
|
# @example
|
38
|
-
#
|
39
|
-
#
|
40
|
-
#
|
41
|
-
#
|
42
|
-
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
|
46
|
-
|
67
|
+
# require 'textminer'
|
68
|
+
# # Set authorization
|
69
|
+
# Textminer.configuration do |config|
|
70
|
+
# config.tdm_key = "<your key>"
|
71
|
+
# end
|
72
|
+
# # Get some elsevier works
|
73
|
+
# res = Textminer.search(member: 78, filter: {has_full_text: true});
|
74
|
+
# links = res.links_xml(true);
|
75
|
+
# # Get full text for an article
|
76
|
+
# out = Textminer.fetch(url: links[0]);
|
77
|
+
# out.url
|
78
|
+
# out.path
|
79
|
+
# out.type
|
80
|
+
# xml = out.parse()
|
81
|
+
# puts xml
|
82
|
+
# xml.xpath('//xocs:cover-date-text', xml.root.namespaces).text
|
83
|
+
# # Get lots of articles
|
84
|
+
# links = links[1..3]
|
85
|
+
# out = links.collect{ |x| Textminer.fetch(url: x) }
|
86
|
+
# out.collect{ |z| z.path }
|
87
|
+
# out.collect{ |z| z.parse }
|
88
|
+
# zz = out[0].parse
|
89
|
+
# zz.xpath('//xocs:cover-date-text', zz.root.namespaces).text
|
90
|
+
#
|
91
|
+
# ## plain text
|
92
|
+
# # get full text links, here doing plain text
|
93
|
+
# links = res.links_plain(true);
|
94
|
+
# # Get full text for an article
|
95
|
+
# res = Textminer.fetch(url: links[0]);
|
96
|
+
# res.url
|
97
|
+
# res.parse
|
98
|
+
#
|
99
|
+
# # With open access content - using Pensoft
|
100
|
+
# res = Textminer.search(member: 2258, filter: {has_full_text: true});
|
101
|
+
# links = res.links_xml(true);
|
102
|
+
# # Get full text for an article
|
103
|
+
# res = Textminer.fetch(url: links[0]);
|
104
|
+
# res.url
|
105
|
+
# res.parse
|
106
|
+
#
|
107
|
+
# # OA content - pdfs, using pensoft again
|
108
|
+
# res = Textminer.search(member: 2258, filter: {has_full_text: true});
|
109
|
+
# links = res.links_pdf(true);
|
110
|
+
# # Get full text for an article
|
111
|
+
# res = Textminer.fetch(url: links[0]);
|
112
|
+
# # url used
|
113
|
+
# res.url
|
114
|
+
# # document type
|
115
|
+
# res.type
|
116
|
+
# # document path on your machine
|
117
|
+
# res.path
|
118
|
+
# # get text
|
119
|
+
# res.parse
|
120
|
+
def self.fetch(url)
|
121
|
+
Miner.new(url).perform
|
47
122
|
end
|
48
123
|
|
49
124
|
##
|
@@ -52,15 +127,34 @@ module Textminer
|
|
52
127
|
# @param path [String] Path to a pdf file downloaded via {fetch}, or
|
53
128
|
# another way.
|
54
129
|
#
|
130
|
+
# This method is used internally within fetch to parse PDFs.
|
131
|
+
#
|
55
132
|
# @example
|
56
|
-
#
|
57
|
-
#
|
58
|
-
#
|
59
|
-
#
|
60
|
-
#
|
133
|
+
# require 'textminer'
|
134
|
+
# res = Textminer.search(member: 2258, filter: {has_full_text: true});
|
135
|
+
# links = res.links_pdf(true);
|
136
|
+
# # Get full text for an article
|
137
|
+
# out = Textminer.fetch(url: links[0]);
|
138
|
+
# # extract pdf to text
|
139
|
+
# Textminer.extract(out.path)
|
61
140
|
def self.extract(path)
|
62
141
|
rr = PDF::Reader.new(path)
|
63
142
|
rr.pages.map { |page| page.text }.join("\n")
|
64
143
|
end
|
65
144
|
|
145
|
+
protected
|
146
|
+
|
147
|
+
def self.link_switch(x, y)
|
148
|
+
case y
|
149
|
+
when nil
|
150
|
+
x.links
|
151
|
+
when 'xml'
|
152
|
+
x.links_xml
|
153
|
+
when 'pdf'
|
154
|
+
x.links_pdf
|
155
|
+
when 'plain'
|
156
|
+
x.links_plain
|
157
|
+
end
|
158
|
+
end
|
159
|
+
|
66
160
|
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# taken from: https://viget.com/extend/easy-gem-configuration-variables-with-defaults
|
2
|
+
module Configuration
|
3
|
+
|
4
|
+
def configuration
|
5
|
+
yield self
|
6
|
+
end
|
7
|
+
|
8
|
+
def define_setting(name, default = nil)
|
9
|
+
class_variable_set("@@#{name}", default)
|
10
|
+
define_class_method "#{name}=" do |value|
|
11
|
+
class_variable_set("@@#{name}", value)
|
12
|
+
end
|
13
|
+
define_class_method name do
|
14
|
+
class_variable_get("@@#{name}")
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
private
|
19
|
+
|
20
|
+
def define_class_method(name, &block)
|
21
|
+
(class << self; self; end).instance_eval do
|
22
|
+
define_method name, &block
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
# Array methods
|
2
|
+
class Array
|
3
|
+
def links(just_urls = true)
|
4
|
+
return self.collect{ |x| x.links(just_urls) }.flatten
|
5
|
+
# if temp.length == 1
|
6
|
+
# return tmp[0]
|
7
|
+
# else
|
8
|
+
# return tmp
|
9
|
+
# end
|
10
|
+
# tmp = self.collect{ |x| x['message']['link'] }
|
11
|
+
# return parse_link(tmp, just_urls)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
class Array
|
16
|
+
def links_xml(just_urls = true)
|
17
|
+
self.collect { |z| z.links_xml(just_urls) }[0]
|
18
|
+
# return parse_link(self.collect { |z| z.links_xml }[0], just_urls)
|
19
|
+
# return parse_link(pull_link(self, '^application\/xml$|^text\/xml$'), just_urls)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
class Array
|
24
|
+
def links_pdf(just_urls = true)
|
25
|
+
self.collect { |z| z.links_pdf(just_urls) }[0]
|
26
|
+
# return parse_link(self.collect { |z| z.links_pdf }[0], just_urls)
|
27
|
+
# return parse_link(pull_link(self, '^application\/pdf$'), just_urls)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
class Array
|
32
|
+
def links_plain(just_urls = true)
|
33
|
+
self.collect { |z| z.links_plain(just_urls) }[0]
|
34
|
+
# return parse_link(self.collect { |z| z.links_plain }[0], just_urls)
|
35
|
+
# return parse_link(pull_link(self, '^application\/plain$|^text\/plain$'), just_urls)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
# def pull_link(x, y)
|
40
|
+
# return x.collect { |z| z.links_xml }[0]
|
41
|
+
# # return x.collect { |z| z['message']['link'] }.compact.collect { |z| z.compact.select { |w| w['content-type'].match(/#{y}/) } }
|
42
|
+
# end
|
43
|
+
|
44
|
+
# def parse_link(x, just_urls)
|
45
|
+
# if x.nil?
|
46
|
+
# return x
|
47
|
+
# else
|
48
|
+
# if just_urls
|
49
|
+
# return x.compact.collect { |z| z.collect{ |y| y['URL'] }}.flatten
|
50
|
+
# else
|
51
|
+
# return x
|
52
|
+
# end
|
53
|
+
# end
|
54
|
+
# end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
# Hash methods
|
2
|
+
class Hash
|
3
|
+
def links(just_urls = true)
|
4
|
+
if self['message']['items'].nil?
|
5
|
+
tmp = self['message']['link']
|
6
|
+
if tmp.nil?
|
7
|
+
tmp = nil
|
8
|
+
else
|
9
|
+
tmp = tmp.reject { |c| c.empty? }
|
10
|
+
end
|
11
|
+
else
|
12
|
+
tmp = self['message']['items'].collect { |x| x['link'] }.reject { |c| c.empty? }
|
13
|
+
end
|
14
|
+
|
15
|
+
return parse_links(tmp, just_urls)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
class Hash
|
20
|
+
def links_xml(just_urls = true)
|
21
|
+
return parse_links(pull_links(self, '^application\/xml$|^text\/xml$'), just_urls)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
class Hash
|
26
|
+
def links_pdf(just_urls = true)
|
27
|
+
return parse_links(pull_links(self, '^application\/pdf$'), just_urls)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
class Hash
|
32
|
+
def links_plain(just_urls = true)
|
33
|
+
return parse_links(pull_links(self, '^application\/plain$|^text\/plain$'), just_urls)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
def pull_links(x, y)
|
38
|
+
if x['message']['items'].nil?
|
39
|
+
tmp = self['message']['link']
|
40
|
+
if tmp.nil?
|
41
|
+
return nil
|
42
|
+
else
|
43
|
+
return tmp.select { |z| z['content-type'].match(/#{y}/) }.reject { |c| c.empty? }
|
44
|
+
end
|
45
|
+
else
|
46
|
+
return x['message']['items'].collect { |x| x['link'].select { |z| z['content-type'].match(/#{y}/) } }.reject { |c| c.empty? }
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
def parse_links(x, just_urls)
|
51
|
+
if x.nil?
|
52
|
+
return nil
|
53
|
+
else
|
54
|
+
if x.empty?
|
55
|
+
return x
|
56
|
+
else
|
57
|
+
if just_urls
|
58
|
+
if x[0].class != Array
|
59
|
+
# return x[0]['URL']
|
60
|
+
return x.collect { |x| x['URL'] }.flatten
|
61
|
+
else
|
62
|
+
return x.collect { |x| x.collect { |z| z['URL'] }}.flatten
|
63
|
+
# return x.collect { |x| x['URL'] }.flatten.compact
|
64
|
+
# return x.collect { |x| x.collect { |z| z['URL'] }}.flatten
|
65
|
+
end
|
66
|
+
else
|
67
|
+
return x
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'uuidtools'
|
3
|
+
|
4
|
+
def detect_type(x)
|
5
|
+
ctype = x.headers['content-type']
|
6
|
+
case ctype
|
7
|
+
when 'text/xml'
|
8
|
+
'xml'
|
9
|
+
when 'text/plain'
|
10
|
+
'plain'
|
11
|
+
when 'application/pdf'
|
12
|
+
'pdf'
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def make_ext(x)
|
17
|
+
case x
|
18
|
+
when 'xml'
|
19
|
+
'xml'
|
20
|
+
when 'plain'
|
21
|
+
'txt'
|
22
|
+
when 'pdf'
|
23
|
+
'pdf'
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def make_path(type)
|
28
|
+
# id = x.split('article/')[1].split('?')[0]
|
29
|
+
# path = id + '.' + type
|
30
|
+
# return path
|
31
|
+
type = make_ext(type)
|
32
|
+
uuid = UUIDTools::UUID.random_create.to_s
|
33
|
+
path = uuid + '.' + type
|
34
|
+
return path
|
35
|
+
end
|
36
|
+
|
37
|
+
def write_disk(res, path)
|
38
|
+
f = File.new(path, "wb")
|
39
|
+
f.write(res.body)
|
40
|
+
f.close()
|
41
|
+
end
|
42
|
+
|
43
|
+
def read_disk(path)
|
44
|
+
return File.read(path)
|
45
|
+
end
|
46
|
+
|
47
|
+
def parse_xml(x)
|
48
|
+
text = read_disk(x)
|
49
|
+
xml = Nokogiri.parse(text)
|
50
|
+
return xml
|
51
|
+
end
|
52
|
+
|
53
|
+
def parse_plain(x)
|
54
|
+
text = read_disk(x)
|
55
|
+
return text
|
56
|
+
end
|
57
|
+
|
58
|
+
def parse_pdf(x)
|
59
|
+
return Textminer.extract(x)
|
60
|
+
end
|
61
|
+
|
62
|
+
def is_elsevier_wiley(x)
|
63
|
+
tmp = x.match 'elsevier|wiley'
|
64
|
+
!tmp.nil?
|
65
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
|
3
|
+
##
|
4
|
+
# Textminer::Mined
|
5
|
+
#
|
6
|
+
# Class to give back text mining object
|
7
|
+
module Textminer
|
8
|
+
class Mined #:nodoc:
|
9
|
+
attr_accessor :url
|
10
|
+
attr_accessor :path
|
11
|
+
attr_accessor :type
|
12
|
+
|
13
|
+
def initialize(url, path, type)
|
14
|
+
self.url = url
|
15
|
+
self.path = path
|
16
|
+
self.type = type
|
17
|
+
end
|
18
|
+
|
19
|
+
def parse
|
20
|
+
case self.type
|
21
|
+
when 'xml'
|
22
|
+
parse_xml(self.path)
|
23
|
+
when 'plain'
|
24
|
+
parse_plain(self.path)
|
25
|
+
when 'pdf'
|
26
|
+
parse_pdf(self.path)
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
31
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require "faraday"
|
2
|
+
require "faraday_middleware"
|
3
|
+
require "multi_json"
|
4
|
+
require 'textminer/helpers/configuration'
|
5
|
+
require 'textminer/mined'
|
6
|
+
require 'textminer/mine_utils'
|
7
|
+
|
8
|
+
##
|
9
|
+
# Textminer::Miner
|
10
|
+
#
|
11
|
+
# Class to give back text mining object
|
12
|
+
module Textminer
|
13
|
+
class Miner #:nodoc:
|
14
|
+
attr_accessor :url
|
15
|
+
|
16
|
+
def initialize(url)
|
17
|
+
self.url = url
|
18
|
+
end
|
19
|
+
|
20
|
+
def perform
|
21
|
+
conn = Faraday.new self.url do |c|
|
22
|
+
c.use FaradayMiddleware::FollowRedirects
|
23
|
+
c.adapter :net_http
|
24
|
+
end
|
25
|
+
|
26
|
+
if is_elsevier_wiley(self.url)
|
27
|
+
res = conn.get do |req|
|
28
|
+
req.headers['CR-Clickthrough-Client-Token'] = Textminer.tdm_key
|
29
|
+
end
|
30
|
+
else
|
31
|
+
res = conn.get
|
32
|
+
end
|
33
|
+
|
34
|
+
type = detect_type(res)
|
35
|
+
path = make_path(type)
|
36
|
+
write_disk(res, path)
|
37
|
+
|
38
|
+
return Mined.new(self.url, path, type)
|
39
|
+
end
|
40
|
+
|
41
|
+
end
|
42
|
+
end
|
data/lib/textminer/request.rb
CHANGED
@@ -1,19 +1,36 @@
|
|
1
1
|
module Textminer
|
2
2
|
class Request #:nodoc:
|
3
3
|
attr_accessor :doi
|
4
|
+
attr_accessor :member
|
5
|
+
attr_accessor :filter
|
6
|
+
attr_accessor :limit
|
7
|
+
attr_accessor :options
|
4
8
|
|
5
|
-
def initialize(doi)
|
9
|
+
def initialize(doi, member, filter, limit, options)
|
6
10
|
self.doi = doi
|
11
|
+
self.member = member
|
12
|
+
self.filter = filter
|
13
|
+
self.limit = limit
|
14
|
+
self.options = options
|
7
15
|
end
|
8
16
|
|
9
17
|
def perform
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
18
|
+
fac = nil
|
19
|
+
|
20
|
+
if member.nil?
|
21
|
+
res = Serrano.works(ids: doi, filter: filter, limit: limit, options: options)
|
22
|
+
if doi.nil?
|
23
|
+
fac = Serrano.works(ids: doi, filter: filter, options: options, facet: 'license:*', limit: 0)
|
24
|
+
fac = fac['message']['facets']['license']['value-count'].to_s
|
25
|
+
end
|
26
|
+
else
|
27
|
+
res = Serrano.members(ids: member, filter: filter, works: true, limit: limit, options: options)
|
28
|
+
if member.nil?
|
29
|
+
fac = Serrano.member(ids: member, filter: filter, options: options, facet: 'license:*', limit: 0)
|
30
|
+
fac = fac['message']['facets']['license']['value-count'].to_s
|
31
|
+
end
|
14
32
|
end
|
15
|
-
|
16
|
-
Response.new(self.doi, coll)
|
33
|
+
Response.new(self.doi, self.member, res, fac)
|
17
34
|
end
|
18
35
|
end
|
19
36
|
end
|
data/lib/textminer/response.rb
CHANGED
@@ -1,52 +1,76 @@
|
|
1
|
+
require 'launchy'
|
2
|
+
require "textminer/link_methods_hash"
|
3
|
+
require "textminer/link_methods_array"
|
4
|
+
|
1
5
|
module Textminer
|
2
6
|
class Response #:nodoc:
|
3
|
-
attr_reader :doi, :response
|
7
|
+
attr_reader :doi, :member, :response, :facet
|
4
8
|
|
5
|
-
def initialize(doi,
|
9
|
+
def initialize(doi, member, response, facet)
|
6
10
|
@doi = doi
|
7
|
-
@
|
11
|
+
@member = member
|
12
|
+
@response = response
|
13
|
+
@facet = facet
|
8
14
|
end
|
9
15
|
|
10
|
-
def
|
11
|
-
|
12
|
-
|
16
|
+
def to_s
|
17
|
+
if !@doi.nil?
|
18
|
+
if @doi.length > 3
|
19
|
+
ending = '...'
|
20
|
+
else
|
21
|
+
ending = ''
|
22
|
+
end
|
23
|
+
tt = sprintf('dois: %s %s', Array(@doi)[0..2].join(', '), ending)
|
24
|
+
end
|
25
|
+
if !@member.nil?
|
26
|
+
tt = 'member: ' + @member.to_s
|
27
|
+
end
|
28
|
+
if @doi.nil? && @member.nil?
|
29
|
+
tt = ''
|
30
|
+
end
|
31
|
+
sprintf("<textminer>: \n search: %s\n no. licenses: %s", tt, @facet)
|
13
32
|
end
|
14
33
|
|
15
|
-
def
|
16
|
-
|
17
|
-
@res.collect { |x| JSON.parse(x.body) }
|
34
|
+
def inspect
|
35
|
+
to_s
|
18
36
|
end
|
19
37
|
|
20
|
-
def
|
21
|
-
|
22
|
-
@res.collect { |x| x['message']['link'] }
|
38
|
+
def body
|
39
|
+
@response
|
23
40
|
end
|
24
41
|
|
25
|
-
def
|
26
|
-
tmp = links
|
27
|
-
|
28
|
-
tmp.collect { |z|
|
29
|
-
z.select{ |x| x['content-type'] == "application/pdf" }[0]['URL']
|
30
|
-
}
|
31
|
-
end
|
42
|
+
def links(just_urls = true)
|
43
|
+
tmp = @response.links(just_urls)
|
44
|
+
compactif(tmp)
|
32
45
|
end
|
33
46
|
|
34
|
-
def
|
35
|
-
tmp =
|
36
|
-
|
37
|
-
tmp.collect { |z|
|
38
|
-
z.select{ |x| x['content-type'] == "application/xml" }[0]['URL']
|
39
|
-
}
|
40
|
-
end
|
47
|
+
def links_xml(just_urls = true)
|
48
|
+
tmp = @response.links_xml(just_urls)
|
49
|
+
compactif(tmp)
|
41
50
|
end
|
42
51
|
|
43
|
-
def
|
44
|
-
|
52
|
+
def links_pdf(just_urls = true)
|
53
|
+
tmp = @response.links_pdf(just_urls)
|
54
|
+
compactif(tmp)
|
45
55
|
end
|
46
56
|
|
47
|
-
|
57
|
+
def links_plain(just_urls = true)
|
58
|
+
tmp = @response.links_plain(just_urls)
|
59
|
+
compactif(tmp)
|
60
|
+
end
|
48
61
|
|
49
|
-
|
62
|
+
protected
|
50
63
|
|
64
|
+
def compactif(z)
|
65
|
+
if z.nil?
|
66
|
+
return z
|
67
|
+
else
|
68
|
+
return z.compact
|
69
|
+
end
|
70
|
+
end
|
71
|
+
# def browse
|
72
|
+
# url = 'http://doi.org/' + @doi
|
73
|
+
# Launchy.open(url)
|
74
|
+
# end
|
51
75
|
end
|
52
76
|
end
|
data/lib/textminer/version.rb
CHANGED
data/textminer.gemspec
CHANGED
@@ -6,7 +6,7 @@ require 'textminer/version'
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = 'textminer'
|
8
8
|
s.version = Textminer::VERSION
|
9
|
-
s.date = '2015-
|
9
|
+
s.date = '2015-12-04'
|
10
10
|
s.summary = "Interact with Crossref's Text and Data mining API"
|
11
11
|
s.description = "Search Crossref's search API for full text content, and get full text content."
|
12
12
|
s.authors = "Scott Chamberlain"
|
@@ -15,7 +15,6 @@ Gem::Specification.new do |s|
|
|
15
15
|
s.licenses = 'MIT'
|
16
16
|
|
17
17
|
s.files = `git ls-files -z`.split("\x0").reject {|f| f.match(%r{^(test|spec|features)/}) }
|
18
|
-
s.test_files = ["test/test_tdm.rb"]
|
19
18
|
s.require_paths = ["lib"]
|
20
19
|
|
21
20
|
s.bindir = 'bin'
|
@@ -27,9 +26,16 @@ Gem::Specification.new do |s|
|
|
27
26
|
s.add_development_dependency "oga", '~> 1.2'
|
28
27
|
s.add_development_dependency "simplecov", '~> 0.10'
|
29
28
|
s.add_development_dependency "codecov", '~> 0.1'
|
29
|
+
|
30
|
+
s.add_runtime_dependency 'serrano', '~> 0.1.4.1'
|
30
31
|
s.add_runtime_dependency 'httparty', '~> 0.13'
|
31
32
|
s.add_runtime_dependency 'thor', '~> 0.19'
|
32
33
|
s.add_runtime_dependency 'json', '~> 1.8'
|
33
|
-
s.add_runtime_dependency '
|
34
|
+
s.add_runtime_dependency 'multi_json', '~> 1.0'
|
35
|
+
s.add_runtime_dependency 'faraday', '~> 0.9.1'
|
36
|
+
s.add_runtime_dependency 'faraday_middleware', '~> 0.10.0'
|
37
|
+
s.add_runtime_dependency 'launchy', '~> 2.4', '>= 2.4.3'
|
34
38
|
s.add_runtime_dependency 'pdf-reader','~> 1.3'
|
39
|
+
s.add_runtime_dependency 'nokogiri', '~> 1.6', '>= 1.6.6.2'
|
40
|
+
s.add_runtime_dependency 'uuidtools', '~> 2.1', '>= 2.1.5'
|
35
41
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textminer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Scott Chamberlain
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-12-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0.1'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: serrano
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: 0.1.4.1
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: 0.1.4.1
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: httparty
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -136,6 +150,48 @@ dependencies:
|
|
136
150
|
- - "~>"
|
137
151
|
- !ruby/object:Gem::Version
|
138
152
|
version: '1.8'
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
name: multi_json
|
155
|
+
requirement: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - "~>"
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '1.0'
|
160
|
+
type: :runtime
|
161
|
+
prerelease: false
|
162
|
+
version_requirements: !ruby/object:Gem::Requirement
|
163
|
+
requirements:
|
164
|
+
- - "~>"
|
165
|
+
- !ruby/object:Gem::Version
|
166
|
+
version: '1.0'
|
167
|
+
- !ruby/object:Gem::Dependency
|
168
|
+
name: faraday
|
169
|
+
requirement: !ruby/object:Gem::Requirement
|
170
|
+
requirements:
|
171
|
+
- - "~>"
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: 0.9.1
|
174
|
+
type: :runtime
|
175
|
+
prerelease: false
|
176
|
+
version_requirements: !ruby/object:Gem::Requirement
|
177
|
+
requirements:
|
178
|
+
- - "~>"
|
179
|
+
- !ruby/object:Gem::Version
|
180
|
+
version: 0.9.1
|
181
|
+
- !ruby/object:Gem::Dependency
|
182
|
+
name: faraday_middleware
|
183
|
+
requirement: !ruby/object:Gem::Requirement
|
184
|
+
requirements:
|
185
|
+
- - "~>"
|
186
|
+
- !ruby/object:Gem::Version
|
187
|
+
version: 0.10.0
|
188
|
+
type: :runtime
|
189
|
+
prerelease: false
|
190
|
+
version_requirements: !ruby/object:Gem::Requirement
|
191
|
+
requirements:
|
192
|
+
- - "~>"
|
193
|
+
- !ruby/object:Gem::Version
|
194
|
+
version: 0.10.0
|
139
195
|
- !ruby/object:Gem::Dependency
|
140
196
|
name: launchy
|
141
197
|
requirement: !ruby/object:Gem::Requirement
|
@@ -145,7 +201,7 @@ dependencies:
|
|
145
201
|
version: '2.4'
|
146
202
|
- - ">="
|
147
203
|
- !ruby/object:Gem::Version
|
148
|
-
version: 2.4.
|
204
|
+
version: 2.4.3
|
149
205
|
type: :runtime
|
150
206
|
prerelease: false
|
151
207
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -155,7 +211,7 @@ dependencies:
|
|
155
211
|
version: '2.4'
|
156
212
|
- - ">="
|
157
213
|
- !ruby/object:Gem::Version
|
158
|
-
version: 2.4.
|
214
|
+
version: 2.4.3
|
159
215
|
- !ruby/object:Gem::Dependency
|
160
216
|
name: pdf-reader
|
161
217
|
requirement: !ruby/object:Gem::Requirement
|
@@ -170,6 +226,46 @@ dependencies:
|
|
170
226
|
- - "~>"
|
171
227
|
- !ruby/object:Gem::Version
|
172
228
|
version: '1.3'
|
229
|
+
- !ruby/object:Gem::Dependency
|
230
|
+
name: nokogiri
|
231
|
+
requirement: !ruby/object:Gem::Requirement
|
232
|
+
requirements:
|
233
|
+
- - "~>"
|
234
|
+
- !ruby/object:Gem::Version
|
235
|
+
version: '1.6'
|
236
|
+
- - ">="
|
237
|
+
- !ruby/object:Gem::Version
|
238
|
+
version: 1.6.6.2
|
239
|
+
type: :runtime
|
240
|
+
prerelease: false
|
241
|
+
version_requirements: !ruby/object:Gem::Requirement
|
242
|
+
requirements:
|
243
|
+
- - "~>"
|
244
|
+
- !ruby/object:Gem::Version
|
245
|
+
version: '1.6'
|
246
|
+
- - ">="
|
247
|
+
- !ruby/object:Gem::Version
|
248
|
+
version: 1.6.6.2
|
249
|
+
- !ruby/object:Gem::Dependency
|
250
|
+
name: uuidtools
|
251
|
+
requirement: !ruby/object:Gem::Requirement
|
252
|
+
requirements:
|
253
|
+
- - "~>"
|
254
|
+
- !ruby/object:Gem::Version
|
255
|
+
version: '2.1'
|
256
|
+
- - ">="
|
257
|
+
- !ruby/object:Gem::Version
|
258
|
+
version: 2.1.5
|
259
|
+
type: :runtime
|
260
|
+
prerelease: false
|
261
|
+
version_requirements: !ruby/object:Gem::Requirement
|
262
|
+
requirements:
|
263
|
+
- - "~>"
|
264
|
+
- !ruby/object:Gem::Version
|
265
|
+
version: '2.1'
|
266
|
+
- - ">="
|
267
|
+
- !ruby/object:Gem::Version
|
268
|
+
version: 2.1.5
|
173
269
|
description: Search Crossref's search API for full text content, and get full text
|
174
270
|
content.
|
175
271
|
email: myrmecocystus@gmail.com
|
@@ -180,18 +276,25 @@ extra_rdoc_files: []
|
|
180
276
|
files:
|
181
277
|
- ".gitignore"
|
182
278
|
- ".travis.yml"
|
279
|
+
- CHANGELOG.md
|
183
280
|
- Gemfile
|
184
281
|
- Gemfile.lock
|
185
|
-
- NEWS.md
|
186
282
|
- README.md
|
187
283
|
- Rakefile
|
188
284
|
- bin/tm
|
285
|
+
- extra/fetch.rb
|
286
|
+
- extra/fetch_method.rb
|
189
287
|
- lib/textminer.rb
|
190
|
-
- lib/textminer/
|
288
|
+
- lib/textminer/helpers/configuration.rb
|
289
|
+
- lib/textminer/link_methods_array.rb
|
290
|
+
- lib/textminer/link_methods_hash.rb
|
291
|
+
- lib/textminer/mine_utils.rb
|
292
|
+
- lib/textminer/mined.rb
|
293
|
+
- lib/textminer/miner.rb
|
191
294
|
- lib/textminer/request.rb
|
192
295
|
- lib/textminer/response.rb
|
296
|
+
- lib/textminer/tmutils.rb
|
193
297
|
- lib/textminer/version.rb
|
194
|
-
- test/test_tdm.rb
|
195
298
|
- textminer.gemspec
|
196
299
|
homepage: http://github.com/sckott/textminer
|
197
300
|
licenses:
|
@@ -213,10 +316,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
213
316
|
version: '0'
|
214
317
|
requirements: []
|
215
318
|
rubyforge_project:
|
216
|
-
rubygems_version: 2.4.5
|
319
|
+
rubygems_version: 2.4.5.1
|
217
320
|
signing_key:
|
218
321
|
specification_version: 4
|
219
322
|
summary: Interact with Crossref's Text and Data mining API
|
220
|
-
test_files:
|
221
|
-
- test/test_tdm.rb
|
323
|
+
test_files: []
|
222
324
|
has_rdoc:
|
data/NEWS.md
DELETED
data/test/test_tdm.rb
DELETED
@@ -1,52 +0,0 @@
|
|
1
|
-
require 'simplecov'
|
2
|
-
SimpleCov.start
|
3
|
-
if ENV['CI']=='true'
|
4
|
-
require 'codecov'
|
5
|
-
SimpleCov.formatter = SimpleCov::Formatter::Codecov
|
6
|
-
end
|
7
|
-
|
8
|
-
require "textminer"
|
9
|
-
require 'fileutils'
|
10
|
-
require "test/unit"
|
11
|
-
require "oga"
|
12
|
-
|
13
|
-
class TestResponse < Test::Unit::TestCase
|
14
|
-
|
15
|
-
def setup
|
16
|
-
@doi = '10.5555/515151'
|
17
|
-
@doi2 = "10.3897/phytokeys.42.7604"
|
18
|
-
@pdf = ["http://annalsofpsychoceramics.labs.crossref.org/fulltext/10.5555/515151.pdf"]
|
19
|
-
@xml = ["http://annalsofpsychoceramics.labs.crossref.org/fulltext/10.5555/515151.xml"]
|
20
|
-
end
|
21
|
-
|
22
|
-
def test_links_endpoint
|
23
|
-
assert_equal(Textminer::Response, Textminer.links(@doi).class)
|
24
|
-
end
|
25
|
-
|
26
|
-
def test_doi
|
27
|
-
assert_equal(@doi, Textminer.links(@doi).doi)
|
28
|
-
end
|
29
|
-
|
30
|
-
def test_pdf
|
31
|
-
assert_equal(@pdf, Textminer.links(@doi).pdf)
|
32
|
-
end
|
33
|
-
|
34
|
-
def test_xml
|
35
|
-
assert_equal(@xml, Textminer.links(@doi).xml)
|
36
|
-
end
|
37
|
-
|
38
|
-
def test_fetch_xml
|
39
|
-
res = Textminer.fetch(@doi2, "xml")
|
40
|
-
assert_equal(HTTParty::Response, res[0].class)
|
41
|
-
assert_true(res[0].ok?)
|
42
|
-
assert_equal(String, res[0].body.class)
|
43
|
-
assert_equal("PhytoKeys", Oga.parse_xml(res[0].body).xpath('//journal-meta//journal-id').text)
|
44
|
-
end
|
45
|
-
|
46
|
-
# def test_fetch_pdf
|
47
|
-
# res = Textminer.fetch(@doi2, "pdf")
|
48
|
-
# assert_equal(HTTParty::Response, res.class)
|
49
|
-
# assert_true(res.ok?)
|
50
|
-
# end
|
51
|
-
|
52
|
-
end
|