textract 0.0.8 → 0.0.9

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
@@ -1,43 +1,49 @@
1
- require_relative '../../lib/textract'
1
+ require 'spec_helper'
2
+ require 'textract'
2
3
 
3
- RSpec.configure do |c|
4
- # filter_run is short-form alias for filter_run_including
5
- c.filter_run :focus => true
6
- end
7
-
8
- describe Textract, :focus do
4
+ describe Textract do
9
5
  it "initializes with the get_text method" do
10
- url = "http://www.tedcruz.org/about/"
11
- article = Textract.get_text(url)
12
- expect(article).to be_a_kind_of Textract::Client
6
+ VCR.use_cassette("cruz") do
7
+ url = "http://www.tedcruz.org/about/"
8
+ article = Textract.get_text(url)
9
+ expect(article).to be_a_kind_of Textract::Client
10
+ end
13
11
  end
14
12
 
15
13
  it "returns article text based on article tag" do
16
- url = "http://gawker.com/1694508525"
17
- article = Textract.get_text(url)
18
- expect(article.text.include?("Import")).to eq true
19
- expect(article.md5).to eq "c11a810a3e73f24aac78fd3e39e69f87"
20
- expect(article.author).to eq "Hamilton Nolan"
14
+ VCR.use_cassette("hamno") do
15
+ url = "http://gawker.com/1694508525"
16
+ article = Textract.get_text(url)
17
+ expect(article.text.include?("Import")).to eq true
18
+ expect(article.md5).to eq "c11a810a3e73f24aac78fd3e39e69f87"
19
+ expect(article.author).to eq "Hamilton Nolan"
20
+ end
21
21
  end
22
22
 
23
23
  it "also includes images" do
24
- url = "http://gawker.com/1696731611"
25
- img = "http://i.kinja-img.com/gawker-media/image/upload/s--fWYFlEv6--/c_fit,fl_progressive,q_80,w_636/l3sjlg0ariqomd4ubtl6.jpg"
26
- article = Textract.get_text(url)
27
- expect(article.text.include?(img)).to be true
24
+ VCR.use_cassette('imgs') do
25
+ url = "http://gawker.com/1696731611"
26
+ img = "http://i.kinja-img.com/gawker-media/image/upload/s--fWYFlEv6--/c_fit,fl_progressive,q_80,w_636/l3sjlg0ariqomd4ubtl6.jpg"
27
+ article = Textract.get_text(url)
28
+ expect(article.text.include?(img)).to be true
29
+ end
28
30
  end
29
31
 
30
32
  it "returns article text based on opengraph description" do
31
- url = "http://www.tedcruz.org/record/our-standard-the-constitution/"
32
- article = Textract.get_text(url)
33
- expect(article.text.include?("Ted Cruz")).to eq true
33
+ VCR.use_cassette('og') do
34
+ url = "http://www.tedcruz.org/record/our-standard-the-constitution/"
35
+ article = Textract.get_text(url)
36
+ expect(article.text.include?("Ted Cruz")).to eq true
37
+ end
34
38
  end
35
39
 
36
40
  it "can find a twitter profile given a selector" do
37
- url = "https://twitter.com/lifehacker"
38
- article = Textract.get_text(url, 'p.ProfileHeaderCard-bio.u-dir')
39
- expect(article.text.strip).to eq "Don't live to geek; geek to live."
40
- expect(article.title).to eq "Lifehacker (@lifehacker) | Twitter"
41
+ VCR.use_cassette('selector') do
42
+ url = "https://twitter.com/lifehacker"
43
+ article = Textract.get_text(url, 'p.ProfileHeaderCard-bio.u-dir')
44
+ expect(article.text.strip).to eq "Don't live to geek; geek to live."
45
+ expect(article.title).to eq "Lifehacker (@lifehacker) | Twitter"
46
+ end
41
47
  end
42
48
 
43
49
  it "gets the page title from the title tag" do
@@ -51,9 +57,19 @@ describe Textract, :focus do
51
57
  end
52
58
 
53
59
  it "converts itself to json" do
54
- url = "http://gawker.com/1694508525"
55
- article = Textract.get_text(url)
56
- expect(article.to_json).to be_a_kind_of String
60
+ VCR.use_cassette('json') do
61
+ url = "http://gawker.com/1694508525"
62
+ article = Textract.get_text(url)
63
+ expect(article.to_json).to be_a_kind_of String
64
+ end
65
+ end
66
+
67
+ it "handles problem urls" do
68
+ VCR.use_cassette('bad frisky') do
69
+ url = "http://www.thefrisky.com/2015-04-22/10-things-i-was-irrationally-jealous-of-in-high-school-and-admittedly-still-am/"
70
+ article = Textract.get_text(url)
71
+ expect(article.to_json).to be_a_kind_of String
72
+ end
57
73
  end
58
74
 
59
75
  end
@@ -0,0 +1,12 @@
1
+ RSpec.configure do |c|
2
+ c.filter_run_including :focus => true
3
+ c.run_all_when_everything_filtered = true
4
+ end
5
+
6
+ require 'vcr'
7
+ VCR.configure do |config|
8
+ config.cassette_library_dir = "spec/fixtures/vcr_cassettes"
9
+ config.hook_into :webmock
10
+ config.allow_http_connections_when_no_cassette = true
11
+ end
12
+
data/textract.gemspec CHANGED
@@ -27,4 +27,6 @@ Gem::Specification.new do |spec|
27
27
  spec.add_development_dependency "rake", "~> 10.0"
28
28
  spec.add_development_dependency "rspec"
29
29
  spec.add_development_dependency "guard-rspec"
30
+ spec.add_development_dependency "vcr"
31
+ spec.add_development_dependency "webmock"
30
32
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textract
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.8
4
+ version: 0.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Adam Pash
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-10 00:00:00.000000000 Z
11
+ date: 2015-04-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: opengraph_parser
@@ -122,6 +122,34 @@ dependencies:
122
122
  - - ">="
123
123
  - !ruby/object:Gem::Version
124
124
  version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: vcr
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: webmock
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
125
153
  description: Extracts article text from a URL
126
154
  email:
127
155
  - adam.pash@gmail.com
@@ -137,7 +165,15 @@ files:
137
165
  - Rakefile
138
166
  - lib/textract.rb
139
167
  - lib/textract/version.rb
168
+ - spec/fixtures/vcr_cassettes/bad_frisky.yml
169
+ - spec/fixtures/vcr_cassettes/cruz.yml
170
+ - spec/fixtures/vcr_cassettes/hamno.yml
171
+ - spec/fixtures/vcr_cassettes/imgs.yml
172
+ - spec/fixtures/vcr_cassettes/json.yml
173
+ - spec/fixtures/vcr_cassettes/og.yml
174
+ - spec/fixtures/vcr_cassettes/selector.yml
140
175
  - spec/lib/textract_spec.rb
176
+ - spec/spec_helper.rb
141
177
  - textract.gemspec
142
178
  homepage: ''
143
179
  licenses:
@@ -164,4 +200,12 @@ signing_key:
164
200
  specification_version: 4
165
201
  summary: Extracts article text from a URL
166
202
  test_files:
203
+ - spec/fixtures/vcr_cassettes/bad_frisky.yml
204
+ - spec/fixtures/vcr_cassettes/cruz.yml
205
+ - spec/fixtures/vcr_cassettes/hamno.yml
206
+ - spec/fixtures/vcr_cassettes/imgs.yml
207
+ - spec/fixtures/vcr_cassettes/json.yml
208
+ - spec/fixtures/vcr_cassettes/og.yml
209
+ - spec/fixtures/vcr_cassettes/selector.yml
167
210
  - spec/lib/textract_spec.rb
211
+ - spec/spec_helper.rb