textract 0.0.8 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/textract/version.rb +1 -1
- data/lib/textract.rb +11 -7
- data/spec/fixtures/vcr_cassettes/bad_frisky.yml +1866 -0
- data/spec/fixtures/vcr_cassettes/cruz.yml +642 -0
- data/spec/fixtures/vcr_cassettes/hamno.yml +632 -0
- data/spec/fixtures/vcr_cassettes/imgs.yml +753 -0
- data/spec/fixtures/vcr_cassettes/json.yml +632 -0
- data/spec/fixtures/vcr_cassettes/og.yml +622 -0
- data/spec/fixtures/vcr_cassettes/selector.yml +684 -0
- data/spec/lib/textract_spec.rb +45 -29
- data/spec/spec_helper.rb +12 -0
- data/textract.gemspec +2 -0
- metadata +46 -2
data/spec/lib/textract_spec.rb
CHANGED
@@ -1,43 +1,49 @@
|
|
1
|
-
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'textract'
|
2
3
|
|
3
|
-
|
4
|
-
# filter_run is short-form alias for filter_run_including
|
5
|
-
c.filter_run :focus => true
|
6
|
-
end
|
7
|
-
|
8
|
-
describe Textract, :focus do
|
4
|
+
describe Textract do
|
9
5
|
it "initializes with the get_text method" do
|
10
|
-
|
11
|
-
|
12
|
-
|
6
|
+
VCR.use_cassette("cruz") do
|
7
|
+
url = "http://www.tedcruz.org/about/"
|
8
|
+
article = Textract.get_text(url)
|
9
|
+
expect(article).to be_a_kind_of Textract::Client
|
10
|
+
end
|
13
11
|
end
|
14
12
|
|
15
13
|
it "returns article text based on article tag" do
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
14
|
+
VCR.use_cassette("hamno") do
|
15
|
+
url = "http://gawker.com/1694508525"
|
16
|
+
article = Textract.get_text(url)
|
17
|
+
expect(article.text.include?("Import")).to eq true
|
18
|
+
expect(article.md5).to eq "c11a810a3e73f24aac78fd3e39e69f87"
|
19
|
+
expect(article.author).to eq "Hamilton Nolan"
|
20
|
+
end
|
21
21
|
end
|
22
22
|
|
23
23
|
it "also includes images" do
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
24
|
+
VCR.use_cassette('imgs') do
|
25
|
+
url = "http://gawker.com/1696731611"
|
26
|
+
img = "http://i.kinja-img.com/gawker-media/image/upload/s--fWYFlEv6--/c_fit,fl_progressive,q_80,w_636/l3sjlg0ariqomd4ubtl6.jpg"
|
27
|
+
article = Textract.get_text(url)
|
28
|
+
expect(article.text.include?(img)).to be true
|
29
|
+
end
|
28
30
|
end
|
29
31
|
|
30
32
|
it "returns article text based on opengraph description" do
|
31
|
-
|
32
|
-
|
33
|
-
|
33
|
+
VCR.use_cassette('og') do
|
34
|
+
url = "http://www.tedcruz.org/record/our-standard-the-constitution/"
|
35
|
+
article = Textract.get_text(url)
|
36
|
+
expect(article.text.include?("Ted Cruz")).to eq true
|
37
|
+
end
|
34
38
|
end
|
35
39
|
|
36
40
|
it "can find a twitter profile given a selector" do
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
+
VCR.use_cassette('selector') do
|
42
|
+
url = "https://twitter.com/lifehacker"
|
43
|
+
article = Textract.get_text(url, 'p.ProfileHeaderCard-bio.u-dir')
|
44
|
+
expect(article.text.strip).to eq "Don't live to geek; geek to live."
|
45
|
+
expect(article.title).to eq "Lifehacker (@lifehacker) | Twitter"
|
46
|
+
end
|
41
47
|
end
|
42
48
|
|
43
49
|
it "gets the page title from the title tag" do
|
@@ -51,9 +57,19 @@ describe Textract, :focus do
|
|
51
57
|
end
|
52
58
|
|
53
59
|
it "converts itself to json" do
|
54
|
-
|
55
|
-
|
56
|
-
|
60
|
+
VCR.use_cassette('json') do
|
61
|
+
url = "http://gawker.com/1694508525"
|
62
|
+
article = Textract.get_text(url)
|
63
|
+
expect(article.to_json).to be_a_kind_of String
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
it "handles problem urls" do
|
68
|
+
VCR.use_cassette('bad frisky') do
|
69
|
+
url = "http://www.thefrisky.com/2015-04-22/10-things-i-was-irrationally-jealous-of-in-high-school-and-admittedly-still-am/"
|
70
|
+
article = Textract.get_text(url)
|
71
|
+
expect(article.to_json).to be_a_kind_of String
|
72
|
+
end
|
57
73
|
end
|
58
74
|
|
59
75
|
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
RSpec.configure do |c|
|
2
|
+
c.filter_run_including :focus => true
|
3
|
+
c.run_all_when_everything_filtered = true
|
4
|
+
end
|
5
|
+
|
6
|
+
require 'vcr'
|
7
|
+
VCR.configure do |config|
|
8
|
+
config.cassette_library_dir = "spec/fixtures/vcr_cassettes"
|
9
|
+
config.hook_into :webmock
|
10
|
+
config.allow_http_connections_when_no_cassette = true
|
11
|
+
end
|
12
|
+
|
data/textract.gemspec
CHANGED
@@ -27,4 +27,6 @@ Gem::Specification.new do |spec|
|
|
27
27
|
spec.add_development_dependency "rake", "~> 10.0"
|
28
28
|
spec.add_development_dependency "rspec"
|
29
29
|
spec.add_development_dependency "guard-rspec"
|
30
|
+
spec.add_development_dependency "vcr"
|
31
|
+
spec.add_development_dependency "webmock"
|
30
32
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.8
|
4
|
+
version: 0.0.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Adam Pash
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-04-
|
11
|
+
date: 2015-04-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: opengraph_parser
|
@@ -122,6 +122,34 @@ dependencies:
|
|
122
122
|
- - ">="
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: vcr
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - ">="
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
type: :development
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - ">="
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: webmock
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - ">="
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '0'
|
146
|
+
type: :development
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - ">="
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '0'
|
125
153
|
description: Extracts article text from a URL
|
126
154
|
email:
|
127
155
|
- adam.pash@gmail.com
|
@@ -137,7 +165,15 @@ files:
|
|
137
165
|
- Rakefile
|
138
166
|
- lib/textract.rb
|
139
167
|
- lib/textract/version.rb
|
168
|
+
- spec/fixtures/vcr_cassettes/bad_frisky.yml
|
169
|
+
- spec/fixtures/vcr_cassettes/cruz.yml
|
170
|
+
- spec/fixtures/vcr_cassettes/hamno.yml
|
171
|
+
- spec/fixtures/vcr_cassettes/imgs.yml
|
172
|
+
- spec/fixtures/vcr_cassettes/json.yml
|
173
|
+
- spec/fixtures/vcr_cassettes/og.yml
|
174
|
+
- spec/fixtures/vcr_cassettes/selector.yml
|
140
175
|
- spec/lib/textract_spec.rb
|
176
|
+
- spec/spec_helper.rb
|
141
177
|
- textract.gemspec
|
142
178
|
homepage: ''
|
143
179
|
licenses:
|
@@ -164,4 +200,12 @@ signing_key:
|
|
164
200
|
specification_version: 4
|
165
201
|
summary: Extracts article text from a URL
|
166
202
|
test_files:
|
203
|
+
- spec/fixtures/vcr_cassettes/bad_frisky.yml
|
204
|
+
- spec/fixtures/vcr_cassettes/cruz.yml
|
205
|
+
- spec/fixtures/vcr_cassettes/hamno.yml
|
206
|
+
- spec/fixtures/vcr_cassettes/imgs.yml
|
207
|
+
- spec/fixtures/vcr_cassettes/json.yml
|
208
|
+
- spec/fixtures/vcr_cassettes/og.yml
|
209
|
+
- spec/fixtures/vcr_cassettes/selector.yml
|
167
210
|
- spec/lib/textract_spec.rb
|
211
|
+
- spec/spec_helper.rb
|