celes-web 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +19 -0
- data/.rvmrc +1 -0
- data/Gemfile +7 -0
- data/Gemfile.lock +32 -0
- data/README.md +44 -0
- data/celes.gemspec +24 -0
- data/lib/celes.rb +71 -0
- data/spec/celes_spec.rb +142 -0
- data/spec/spec_helper.rb +17 -0
- metadata +123 -0
data/.gitignore
ADDED
data/.rvmrc
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
rvm use --create 1.9.3@celes
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
celes-web (1.0.0)
|
5
|
+
nokogiri
|
6
|
+
|
7
|
+
GEM
|
8
|
+
remote: https://rubygems.org/
|
9
|
+
specs:
|
10
|
+
diff-lcs (1.2.5)
|
11
|
+
mini_portile (0.5.2)
|
12
|
+
nokogiri (1.6.0)
|
13
|
+
mini_portile (~> 0.5.0)
|
14
|
+
rake (10.1.0)
|
15
|
+
rspec (2.14.1)
|
16
|
+
rspec-core (~> 2.14.0)
|
17
|
+
rspec-expectations (~> 2.14.0)
|
18
|
+
rspec-mocks (~> 2.14.0)
|
19
|
+
rspec-core (2.14.7)
|
20
|
+
rspec-expectations (2.14.4)
|
21
|
+
diff-lcs (>= 1.1.3, < 2.0)
|
22
|
+
rspec-mocks (2.14.4)
|
23
|
+
|
24
|
+
PLATFORMS
|
25
|
+
ruby
|
26
|
+
|
27
|
+
DEPENDENCIES
|
28
|
+
bundler (~> 1.3)
|
29
|
+
celes-web!
|
30
|
+
nokogiri
|
31
|
+
rake
|
32
|
+
rspec
|
data/README.md
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
Celes
|
2
|
+
=====
|
3
|
+
|
4
|
+
A simple Ruby gem for parsing snippets of text and images based on the HTML content at a provided URL.
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
Add this line to your application's Gemfile:
|
9
|
+
|
10
|
+
gem 'celes-web'
|
11
|
+
|
12
|
+
And then execute:
|
13
|
+
|
14
|
+
$ bundle install
|
15
|
+
|
16
|
+
Or install it yourself as:
|
17
|
+
|
18
|
+
$ gem install 'celes-web'
|
19
|
+
|
20
|
+
## Usage
|
21
|
+
|
22
|
+
Require 'celes' and then construct a Celes object providing the URL to parse
|
23
|
+
|
24
|
+
require 'celes'
|
25
|
+
c = Celes.new(url: 'http://www.yahoo.com')
|
26
|
+
|
27
|
+
Calling '#snippets' without any options returns the text snippets that are at least 40 characters long, each truncated to at most 140 characters
|
28
|
+
|
29
|
+
c.snippets #=> ['Array', 'of', 'Strings']
|
30
|
+
|
31
|
+
Calling '#images' returns the `src` attribute of every image found in the page body
|
32
|
+
|
33
|
+
c.images #=> ['Array', 'of', 'Strings']
|
34
|
+
|
35
|
+
### Options
|
36
|
+
|
37
|
+
Use 'min_snippet_length' to define the minimum length of text in the document node to be considered a "snippet". The default is 40 characters.
|
38
|
+
|
39
|
+
c = Celes.new(url: 'http://www.yahoo.com', min_snippet_length: 100)
|
40
|
+
|
41
|
+
Use 'snip_length' to truncate longer snippets down to a shorter length. By default, Celes will truncate snippets to 140 characters.
|
42
|
+
|
43
|
+
c = Celes.new(url: 'http://www.yahoo.com', snip_length: 45)
|
44
|
+
|
data/celes.gemspec
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
|
5
|
+
# Gem specification for celes-web: package identity, the files shipped with
# the gem, and its dependencies.
Gem::Specification.new do |spec|
  spec.name          = "celes-web"
  spec.version       = '1.0.0'
  spec.authors       = ["Andy Schrage"]
  spec.email         = ["ajschrag@mtu.edu"]
  spec.description   = %q{A simple Ruby gem for providing a snippet of text and images based on a url.}
  spec.summary       = %q{A simple Ruby gem for providing a snippet of text and thumbnails based on a url.}
  spec.homepage      = "https://github.com/Swimminschrage/celes"
  spec.license       = "MIT"

  # Ship every file tracked by git. NUL-delimited output (-z) keeps file names
  # containing spaces, newlines or non-ASCII characters intact.
  spec.files         = `git ls-files -z`.split("\x0")
  # Anything under bin/ is exposed as an executable, by base name.
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Needed only when developing the gem.
  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  spec.add_development_dependency 'rspec'

  # Needed at runtime for HTML parsing.
  spec.add_dependency 'nokogiri'
end
|
data/lib/celes.rb
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
require 'uri'
|
2
|
+
require 'net/http'
|
3
|
+
require 'nokogiri'
|
4
|
+
|
5
|
+
# Fetches the page at a URL and extracts text snippets and image sources from
# the HTML body.
#
#   c = Celes.new(url: 'http://www.yahoo.com', snip_length: 45)
#   c.snippets #=> ['Array', 'of', 'Strings']
#   c.images   #=> ['Array', 'of', 'Strings']
class Celes
  # Snippets are truncated to this many characters unless :snip_length is given.
  DEFAULT_SNIP_LENGTH = 140
  # Text shorter than this is ignored unless :min_snippet_length is given.
  DEFAULT_MIN_SNIPPET_LENGTH = 40
  # Upper bound on followed redirects, so a redirect loop cannot recurse forever.
  MAX_REDIRECTS = 10

  # The URL being parsed. Updated to the final location when redirects are followed.
  attr_reader :url

  # @param options [Hash]
  # @option options [String] :url the page to fetch (required)
  # @option options [Integer] :snip_length maximum snippet length (default 140)
  # @option options [Integer] :min_snippet_length minimum text length for a
  #   node to count as a snippet (default 40)
  # @raise [RuntimeError] when no :url is provided
  def initialize(options = {})
    raise 'No URL provided' unless options[:url]

    @url = options[:url]
    @uri = URI(@url)

    parse_options(options)
  end

  # Text snippets found in <p> and <span> elements of the page body.
  # The page is fetched on first use and the result is cached.
  #
  # @return [Array<String>]
  # @raise [RuntimeError] when the page cannot be retrieved
  def snippets
    make_request unless @snippets
    @snippets
  end

  # The src attribute of every <img> in the page body.
  # The page is fetched on first use and the result is cached.
  #
  # @return [Array<String>]
  # @raise [RuntimeError] when the page cannot be retrieved
  def images
    make_request unless @images
    @images
  end

  private

  # Applies defaults for missing options and clamps negative lengths to 0.
  def parse_options(options)
    @snip_length = [options[:snip_length] || DEFAULT_SNIP_LENGTH, 0].max
    @min_snippet_length = [options[:min_snippet_length] || DEFAULT_MIN_SNIPPET_LENGTH, 0].max
  end

  # Fetches the page and populates both @images and @snippets.
  def make_request
    html = get_html_content
    raise "Unable to reach #{@url}" unless html

    parse_html(html)
  end

  # Requests @uri, following redirects.
  #
  # @param redirects_left [Integer] how many more redirects may be followed
  # @return [String, nil] the body of a successful response; nil for any other
  #   response, or for a redirect that carries no Location header
  # @raise [RuntimeError] when more than MAX_REDIRECTS redirects are encountered
  def get_html_content(redirects_left = MAX_REDIRECTS)
    resp = Net::HTTP.get_response(@uri)
    case resp
    when Net::HTTPSuccess then resp.body
    when Net::HTTPRedirection
      location = resp['location']
      return nil unless location
      raise "Too many redirects while fetching #{@url}" if redirects_left <= 0

      # Location may be relative; resolve it against the URI just requested.
      @uri = URI.join(@uri.to_s, location)
      @url = @uri.to_s
      get_html_content(redirects_left - 1)
    end
  end

  # Extracts image sources and text snippets from the HTML document.
  #
  # Surrounding whitespace is stripped before the minimum-length check and
  # before truncation, so indentation in the markup neither counts toward a
  # snippet's length nor ends up in the returned text.
  def parse_html(response)
    doc = Nokogiri::HTML.parse(response)

    @images = doc.xpath('/html/body//img/@src').map(&:value)

    texts = doc.xpath('/html/body//*[self::p or self::span]/text()').map { |node| node.to_s.strip }
    @snippets = texts
                .select { |text| text.length >= @min_snippet_length }
                .map { |text| text[0, @snip_length] }
  end
end
|
data/spec/celes_spec.rb
ADDED
@@ -0,0 +1,142 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require_relative '../lib/celes'
|
3
|
+
require 'uri'
|
4
|
+
|
5
|
+
# Behavioural specs for Celes. Every example is hermetic: either the private
# #get_html_content is stubbed with canned HTML, or Net::HTTP.get_response is
# stubbed with canned responses, so no example touches the network.
describe Celes do
  let(:url) { 'http://www.theredheadproject.com' }
  let(:celes) { Celes.new(url: url) }
  subject { celes }

  it { should respond_to(:images) }
  it { should respond_to(:snippets) }
  it { should respond_to(:url) }

  it 'should error out when provided no url' do
    expect { Celes.new }.to raise_error(RuntimeError)
  end

  describe 'when provided a valid url' do
    its(:url) { should eq url }

    context 'that causes a redirect' do
      let(:url) { 'http://google.com' }
      let(:target) { 'http://www.google.com/' }
      let(:page) { '<html><body></body></html>' }

      # Real response objects are required because Celes dispatches on the
      # response class; only the body of the final response is stubbed.
      let(:redirect) do
        Net::HTTPFound.new('1.1', '302', 'Found').tap { |resp| resp['location'] = target }
      end
      let(:success) do
        Net::HTTPOK.new('1.1', '200', 'OK').tap { |resp| resp.stub(:body).and_return(page) }
      end

      before { Net::HTTP.stub(:get_response).and_return(redirect, success) }

      it 'follows the redirect and returns the page content' do
        # get_html_content is private, hence the explicit send.
        expect(subject.send(:get_html_content)).to eq page
        expect(subject.url).to eq target
      end
    end

    describe 'that contains images and text snippets' do
      before do
        subject.stub(:get_html_content) { '<html><body><p>This is a test for some text that should be long enough</p><img src="myimg.png" width="100" height="100"/></body></html>' }
      end

      it 'should have only 1 image' do
        expect(subject.images).to be_an Array
        expect(subject.images.size).to eq 1
        expect(subject.images[0]).to eq 'myimg.png'
      end

      it 'should have only 1 snippet' do
        expect(subject.snippets).to be_an Array
        expect(subject.snippets.size).to eq 1
        expect(subject.snippets[0]).to eq 'This is a test for some text that should be long enough'
      end
    end

    # Marking these tests as pending until I can figure out how to get Nokogiri to parse html that has img and p tags
    # in the head.
    describe 'only parses images and text snippets within the body' do
      before do
        subject.stub(:get_html_content).and_return '<html><head><p>This is a test for some text that should be long enough</p><img src="myimg.png" width="100" height="100"/></head><body><p>Body Text</p></body></html>'
      end

      xit 'should have no images' do
        expect(subject.images).to be_an Array
        expect(subject.images.size).to eq 0
      end

      xit 'should have no snippets' do
        expect(subject.snippets).to be_an Array
        expect(subject.snippets.size).to eq 0
      end
    end

    describe 'can take options' do
      context ':min_snippet_length' do
        it 'defaults to 40 characters' do
          c = Celes.new(url: url)
          c.stub(:get_html_content).and_return '<p>Not 40 characters</p><p>Definitely 40 characters long test here!</p>'

          expect(c.snippets.size).to eq 1
          expect(c.snippets[0]).to eq 'Definitely 40 characters long test here!'
        end

        it 'less than 0 defaults to 0' do
          c = Celes.new(url: url, min_snippet_length: -10)
          c.stub(:get_html_content).and_return '<p>Not 40 characters</p><p>Definitely 40 characters long test here!</p>'

          expect(c.snippets.size).to eq 2
          expect(c.snippets[0]).to eq 'Not 40 characters'
          expect(c.snippets[1]).to eq 'Definitely 40 characters long test here!'
        end

        it 'can be set to any int 0 or greater' do
          c = Celes.new(url: url, min_snippet_length: 17)
          c.stub(:get_html_content).and_return '<p>Not 40 characters</p><p>Definitely 40 characters long test here!</p>'

          expect(c.snippets.size).to eq 2
          expect(c.snippets[0]).to eq 'Not 40 characters'
          expect(c.snippets[1]).to eq 'Definitely 40 characters long test here!'
        end

        it 'can be set to 0' do
          c = Celes.new(url: url, min_snippet_length: 0)
          c.stub(:get_html_content).and_return '<p>A</p><p>Not 40 characters</p><p>Definitely 40 characters long test here!</p>'

          expect(c.snippets.size).to eq 3
          expect(c.snippets[0]).to eq 'A'
        end
      end

      context ':snip_length' do
        let(:text59) { 'C' * 59 }
        let(:text140) { 'A' * 140 }
        let(:text150) { 'B' * 150 }

        before { Celes.any_instance.stub(:get_html_content).and_return "<p>#{text59}</p><p>#{text140}</p><p>#{text150}</p>" }

        it 'defaults to 140 characters' do
          c = Celes.new(url: url)

          expect(c.snippets.size).to eq 3
          expect(c.snippets[0]).to eq text59
          expect(c.snippets[1]).to eq text140
          expect(c.snippets[2]).to eq text150[0, 140]
        end

        it 'defaults to 0 when a negative value is provided' do
          c = Celes.new(url: url, snip_length: -100)

          expect(c.snippets.size).to eq 3
          expect(c.snippets[0]).to be_empty
          expect(c.snippets[1]).to be_empty
          expect(c.snippets[2]).to be_empty
        end

        it 'can be set to be any value greater than 0' do
          c = Celes.new(url: url, snip_length: 60)

          expect(c.snippets.size).to eq 3
          expect(c.snippets[0]).to eq text59
          expect(c.snippets[1]).to eq text140[0, 60]
          expect(c.snippets[2]).to eq text150[0, 60]
        end
      end
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
|
2
|
+
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
3
|
+
# Require this file using `require "spec_helper"` to ensure that it is only
|
4
|
+
# loaded once.
|
5
|
+
#
|
6
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
7
|
+
# Suite-wide RSpec settings, applied once when spec_helper is required.
RSpec.configure do |rspec|
  # Must stay ahead of the :focus filter below, which is written as a bare symbol.
  rspec.treat_symbols_as_metadata_keys_with_true_values = true
  rspec.run_all_when_everything_filtered = true
  rspec.filter_run :focus

  # Randomised ordering exposes examples that depend on one another. To
  # reproduce a failing order, rerun with the seed printed at the end of the
  # run:
  #     --seed 1234
  rspec.order = 'random'
end
|
metadata
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: celes-web
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Andy Schrage
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-12-17 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: bundler
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '1.3'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '1.3'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rake
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: rspec
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: nokogiri
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
description: A simple Ruby gem for providing a snipet of text and images based on
|
79
|
+
a url.
|
80
|
+
email:
|
81
|
+
- ajschrag@mtu.edu
|
82
|
+
executables: []
|
83
|
+
extensions: []
|
84
|
+
extra_rdoc_files: []
|
85
|
+
files:
|
86
|
+
- .gitignore
|
87
|
+
- .rvmrc
|
88
|
+
- Gemfile
|
89
|
+
- Gemfile.lock
|
90
|
+
- README.md
|
91
|
+
- celes.gemspec
|
92
|
+
- lib/celes.rb
|
93
|
+
- spec/celes_spec.rb
|
94
|
+
- spec/spec_helper.rb
|
95
|
+
homepage: https://github.com/Swimminschrage/celes
|
96
|
+
licenses:
|
97
|
+
- MIT
|
98
|
+
post_install_message:
|
99
|
+
rdoc_options: []
|
100
|
+
require_paths:
|
101
|
+
- lib
|
102
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
103
|
+
none: false
|
104
|
+
requirements:
|
105
|
+
- - ! '>='
|
106
|
+
- !ruby/object:Gem::Version
|
107
|
+
version: '0'
|
108
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
109
|
+
none: false
|
110
|
+
requirements:
|
111
|
+
- - ! '>='
|
112
|
+
- !ruby/object:Gem::Version
|
113
|
+
version: '0'
|
114
|
+
requirements: []
|
115
|
+
rubyforge_project:
|
116
|
+
rubygems_version: 1.8.25
|
117
|
+
signing_key:
|
118
|
+
specification_version: 3
|
119
|
+
summary: A simple Ruby gem for providing a snipet of text and thumbnails based on
|
120
|
+
a url.
|
121
|
+
test_files:
|
122
|
+
- spec/celes_spec.rb
|
123
|
+
- spec/spec_helper.rb
|