celes-web 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +19 -0
- data/.rvmrc +1 -0
- data/Gemfile +7 -0
- data/Gemfile.lock +32 -0
- data/README.md +44 -0
- data/celes.gemspec +24 -0
- data/lib/celes.rb +71 -0
- data/spec/celes_spec.rb +142 -0
- data/spec/spec_helper.rb +17 -0
- metadata +123 -0
data/.gitignore
ADDED
data/.rvmrc
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
rvm use --create 1.9.3@celes
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
celes-web (1.0.0)
|
5
|
+
nokogiri
|
6
|
+
|
7
|
+
GEM
|
8
|
+
remote: https://rubygems.org/
|
9
|
+
specs:
|
10
|
+
diff-lcs (1.2.5)
|
11
|
+
mini_portile (0.5.2)
|
12
|
+
nokogiri (1.6.0)
|
13
|
+
mini_portile (~> 0.5.0)
|
14
|
+
rake (10.1.0)
|
15
|
+
rspec (2.14.1)
|
16
|
+
rspec-core (~> 2.14.0)
|
17
|
+
rspec-expectations (~> 2.14.0)
|
18
|
+
rspec-mocks (~> 2.14.0)
|
19
|
+
rspec-core (2.14.7)
|
20
|
+
rspec-expectations (2.14.4)
|
21
|
+
diff-lcs (>= 1.1.3, < 2.0)
|
22
|
+
rspec-mocks (2.14.4)
|
23
|
+
|
24
|
+
PLATFORMS
|
25
|
+
ruby
|
26
|
+
|
27
|
+
DEPENDENCIES
|
28
|
+
bundler (~> 1.3)
|
29
|
+
celes-web!
|
30
|
+
nokogiri
|
31
|
+
rake
|
32
|
+
rspec
|
data/README.md
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
Celes
|
2
|
+
=====
|
3
|
+
|
4
|
+
A simple Ruby gem for parsing snippets of text and images based on the HTML content at a provided URL.
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
Add this line to your application's Gemfile:
|
9
|
+
|
10
|
+
gem 'celes-web'
|
11
|
+
|
12
|
+
And then execute:
|
13
|
+
|
14
|
+
$ bundle install
|
15
|
+
|
16
|
+
Or install it yourself as:
|
17
|
+
|
18
|
+
$ gem install 'celes-web'
|
19
|
+
|
20
|
+
## Usage
|
21
|
+
|
22
|
+
Require 'celes' and then construct a Celes object, providing the URL to parse:
|
23
|
+
|
24
|
+
require 'celes'
|
25
|
+
c = Celes.new(url: 'http://www.yahoo.com')
|
26
|
+
|
27
|
+
Calling '#snippets' without any options returns the text snippets that are at least 40 characters long, each truncated to at most 140 characters:
|
28
|
+
|
29
|
+
c.snippets #=> ['Array', 'of', 'Strings']
|
30
|
+
|
31
|
+
Calling '#images' without any options returns the `src` attribute of every image found in the page body:
|
32
|
+
|
33
|
+
c.images #=> ['Array', 'of', 'Strings']
|
34
|
+
|
35
|
+
### Options
|
36
|
+
|
37
|
+
Use 'min_snippet_length' to define the minimum length of text in the document node to be considered a "snippet". The default is 40 characters.
|
38
|
+
|
39
|
+
c = Celes.new(url: 'http://www.yahoo.com', min_snippet_length: 100)
|
40
|
+
|
41
|
+
Use 'snip_length' to truncate longer snippets down to a shorter length. By default, Celes will truncate snippets to 140 characters.
|
42
|
+
|
43
|
+
c = Celes.new(url: 'http://www.yahoo.com', snip_length: 45)
|
44
|
+
|
data/celes.gemspec
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
|
5
|
+
Gem::Specification.new do |spec|
|
6
|
+
spec.name = "celes-web"
|
7
|
+
spec.version = '1.0.0'
|
8
|
+
spec.authors = ["Andy Schrage"]
|
9
|
+
spec.email = ["ajschrag@mtu.edu"]
|
10
|
+
spec.description = %q{A simple Ruby gem for providing a snipet of text and images based on a url.}
|
11
|
+
spec.summary = %q{A simple Ruby gem for providing a snipet of text and thumbnails based on a url.}
|
12
|
+
spec.homepage = "https://github.com/Swimminschrage/celes"
|
13
|
+
spec.license = "MIT"
|
14
|
+
|
15
|
+
spec.files = `git ls-files`.split($/)
|
16
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
17
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
18
|
+
spec.require_paths = ["lib"]
|
19
|
+
|
20
|
+
spec.add_development_dependency "bundler", "~> 1.3"
|
21
|
+
spec.add_development_dependency "rake"
|
22
|
+
spec.add_development_dependency 'rspec'
|
23
|
+
spec.add_dependency 'nokogiri'
|
24
|
+
end
|
data/lib/celes.rb
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
require 'uri'
|
2
|
+
require 'net/http'
|
3
|
+
require 'nokogiri'
|
4
|
+
|
5
|
+
# Fetches the HTML document at a URL and extracts two things from its <body>:
# text snippets (text nodes directly inside <p> and <span> elements) and the
# `src` attribute of every <img> element.
#
# The page is fetched lazily on the first call to #snippets or #images, and
# the parsed results are cached on the instance afterwards.
#
# Supported options (all passed to .new as a Hash):
#   :url                - required; the address of the page to parse.
#   :snip_length        - maximum length of each returned snippet
#                         (default 140, negative values are treated as 0).
#   :min_snippet_length - minimum stripped length a text node must have to be
#                         kept as a snippet (default 40, negative values are
#                         treated as 0).
class Celes
  # Upper bound on the number of consecutive HTTP redirects followed for a
  # single fetch. Prevents endless recursion on redirect cycles.
  MAX_REDIRECTS = 10

  # The URL being parsed. Updated to the final location when the request is
  # redirected.
  attr_reader :url

  # @param options [Hash] see the class documentation for supported keys
  # @raise [RuntimeError] when no :url option is given
  def initialize(options = {})
    raise 'No URL provided' unless options[:url]

    @url = options[:url]
    @uri = URI(@url)

    parse_options(options)
  end

  # @return [Array<String>] the text snippets found on the page
  # @raise [RuntimeError] when the page cannot be retrieved
  def snippets
    make_request unless @snippets
    @snippets
  end

  # @return [Array<String>] the `src` values of the images found on the page
  # @raise [RuntimeError] when the page cannot be retrieved
  def images
    make_request unless @images
    @images
  end

  private

  # Stores the length options, applying defaults for missing values and
  # clamping negative values to 0.
  def parse_options(options)
    @snip_length = options[:snip_length] || 140
    @min_snippet_length = options[:min_snippet_length] || 40

    @min_snippet_length = 0 if @min_snippet_length < 0
    @snip_length = 0 if @snip_length < 0
  end

  # Fetches the page and parses it, populating @images and @snippets.
  #
  # @raise [RuntimeError] when no HTML body could be obtained
  def make_request
    html = get_html_content
    raise "Unable to reach #{@url}" unless html

    parse_html(html)
  end

  # Performs the HTTP GET, following redirects.
  #
  # @param redirects_left [Integer] how many more redirects may be followed
  # @return [String, nil] the response body on success; nil when the response
  #   is neither a success nor a followable redirect, or when the redirect
  #   budget is exhausted
  def get_html_content(redirects_left = MAX_REDIRECTS)
    resp = Net::HTTP.get_response(@uri)

    case resp
    when Net::HTTPSuccess then resp.body
    when Net::HTTPRedirection
      location = resp['location']
      # A 3xx without a Location header (e.g. 304) has nowhere to go, and a
      # redirect cycle must not recurse forever.
      return nil if location.nil? || redirects_left <= 0

      # Location may be relative; resolve it against the current URI.
      @uri = URI.join(@uri.to_s, location)
      @url = @uri.to_s
      get_html_content(redirects_left - 1)
    end
  end

  # Extracts image sources and text snippets from the document body.
  #
  # @param response [String] the raw HTML
  # @return [Array<String>] the snippets (also stored in @snippets)
  def parse_html(response)
    doc = Nokogiri::HTML.parse(response)

    @images = doc.xpath('/html/body//img/@src').map(&:value)

    # The length filter is applied to the stripped text; the snippet itself
    # is the untrimmed text cut down to @snip_length characters.
    @snippets = doc.xpath('/html/body//*[self::p or self::span]/text()')
                   .map(&:to_s)
                   .select { |text| text.strip.length >= @min_snippet_length }
                   .map { |text| text[0, @snip_length] }
  end
end
|
data/spec/celes_spec.rb
ADDED
@@ -0,0 +1,142 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require_relative '../lib/celes'
|
3
|
+
require 'uri'
|
4
|
+
|
5
|
+
describe Celes do
|
6
|
+
let(:url) {'http://www.theredheadproject.com'}
|
7
|
+
let(:celes) { Celes.new(url: url) }
|
8
|
+
subject{celes}
|
9
|
+
|
10
|
+
it {should respond_to(:images)}
|
11
|
+
it {should respond_to(:snippets)}
|
12
|
+
it {should respond_to(:url)}
|
13
|
+
|
14
|
+
it 'should error out when provided no url' do
|
15
|
+
expect{Celes.new}.to raise_error(RuntimeError)
|
16
|
+
end
|
17
|
+
|
18
|
+
describe 'when provided a valid url' do
|
19
|
+
its(:url) {should be == url}
|
20
|
+
|
21
|
+
context 'that causes a redirect' do
|
22
|
+
let(:url) {'http://google.com'} # Should redirect the user to http://www.google.com
|
23
|
+
let(:celes) { Celes.new(url: url) }
|
24
|
+
subject{celes}
|
25
|
+
|
26
|
+
its(:get_html_content) {should_not be nil}
|
27
|
+
end
|
28
|
+
|
29
|
+
describe 'that contains images and text snippets' do
|
30
|
+
before do
|
31
|
+
subject.stub(:get_html_content) { '<html><body><p>This is a test for some text that should be long enough</p><img src="myimg.png" width="100" height="100"/></body></html>' }
|
32
|
+
end
|
33
|
+
|
34
|
+
it 'should have only 1 image' do
|
35
|
+
expect(subject.images).to be_an Array
|
36
|
+
expect(subject.images.size).to be 1
|
37
|
+
expect(subject.images[0]).to be == 'myimg.png'
|
38
|
+
end
|
39
|
+
|
40
|
+
it 'should have only 1 snippet' do
|
41
|
+
expect(subject.snippets).to be_an Array
|
42
|
+
expect(subject.snippets.size).to be 1
|
43
|
+
expect(subject.snippets[0]).to be == 'This is a test for some text that should be long enough'
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
# Marking these tests as pending until I can figure out how to get Nokogiri to parse html that has img and p tags
|
48
|
+
# in the head.
|
49
|
+
describe 'only parses images and text snippets within the body' do
|
50
|
+
let(:url) {'http://www.theredheadproject.com'}
|
51
|
+
let(:nobody) { Celes.new(url: url) }
|
52
|
+
subject {nobody}
|
53
|
+
before do
|
54
|
+
nobody.stub(:get_html_content).and_return '<html><head><p>This is a test for some text that should be long enough</p><img src="myimg.png" width="100" height="100"/></head><body><p>Body Text</p></body></html>'
|
55
|
+
end
|
56
|
+
|
57
|
+
xit 'should have no images' do
|
58
|
+
expect(subject.images).to be_an Array
|
59
|
+
subject.images.each { |img| puts img }
|
60
|
+
expect(subject.images.size).to be 0
|
61
|
+
|
62
|
+
end
|
63
|
+
|
64
|
+
xit 'should have no snippets' do
|
65
|
+
expect(subject.snippets).to be_an Array
|
66
|
+
expect(subject.snippets.size).to be 0
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
describe 'can take options' do
|
71
|
+
context ':min_snippet_length' do
|
72
|
+
it 'defaults to 40 characters' do
|
73
|
+
c = Celes.new(url: url)
|
74
|
+
c.stub(:get_html_content).and_return '<p>Not 40 characters</p><p>Definitely 40 characters long test here!</p>'
|
75
|
+
|
76
|
+
expect(c.snippets.size).to be 1
|
77
|
+
expect(c.snippets[0]).to eq 'Definitely 40 characters long test here!'
|
78
|
+
end
|
79
|
+
|
80
|
+
it 'less than 0 defaults to 0' do
|
81
|
+
c = Celes.new(url: url, min_snippet_length: -10)
|
82
|
+
c.stub(:get_html_content).and_return '<p>Not 40 characters</p><p>Definitely 40 characters long test here!</p>'
|
83
|
+
|
84
|
+
expect(c.snippets.size).to be 2
|
85
|
+
expect(c.snippets[0]).to eq 'Not 40 characters'
|
86
|
+
expect(c.snippets[1]).to eq 'Definitely 40 characters long test here!'
|
87
|
+
end
|
88
|
+
|
89
|
+
it 'can be set to any int 0 or greater' do
|
90
|
+
c = Celes.new(url: url, min_snippet_length: 17)
|
91
|
+
c.stub(:get_html_content).and_return '<p>Not 40 characters</p><p>Definitely 40 characters long test here!</p>'
|
92
|
+
|
93
|
+
expect(c.snippets.size).to be 2
|
94
|
+
expect(c.snippets[0]).to eq 'Not 40 characters'
|
95
|
+
expect(c.snippets[1]).to eq 'Definitely 40 characters long test here!'
|
96
|
+
end
|
97
|
+
|
98
|
+
it 'can be set to 0' do
|
99
|
+
c = Celes.new(url: url, min_snippet_length: 0)
|
100
|
+
c.stub(:get_html_content).and_return '<p>A</p><p>Not 40 characters</p><p>Definitely 40 characters long test here!</p>'
|
101
|
+
|
102
|
+
expect(c.snippets.size).to be 3
|
103
|
+
expect(c.snippets[0]).to eq 'A'
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
context ':snip_length' do
|
108
|
+
let(:text59) { 'C' * 59 }
|
109
|
+
let(:text140) { 'A' * 140 }
|
110
|
+
let(:text150) { 'B' * 150 }
|
111
|
+
|
112
|
+
before {Celes.any_instance.stub(:get_html_content).and_return "<p>#{text59}</p><p>#{text140}</p><p>#{text150}</p>"}
|
113
|
+
|
114
|
+
it 'defaults to 140 characters' do
|
115
|
+
c = Celes.new(url: url)
|
116
|
+
|
117
|
+
expect(c.snippets.size).to be 3
|
118
|
+
expect(c.snippets[0]).to eq text59
|
119
|
+
expect(c.snippets[1]).to eq text140
|
120
|
+
expect(c.snippets[2]).to eq text150[0, 140]
|
121
|
+
end
|
122
|
+
|
123
|
+
it 'defaults to 0 when a negative value is provided' do
|
124
|
+
c = Celes.new(url: url, snip_length: -100)
|
125
|
+
|
126
|
+
expect(c.snippets.size).to be 3
|
127
|
+
expect(c.snippets[0]).to be_empty
|
128
|
+
expect(c.snippets[1]).to be_empty
|
129
|
+
end
|
130
|
+
|
131
|
+
it 'can be set to be any value greater than 0' do
|
132
|
+
c = Celes.new(url: url, snip_length: 60)
|
133
|
+
|
134
|
+
expect(c.snippets.size).to be 3
|
135
|
+
expect(c.snippets[0]).to eq text59
|
136
|
+
expect(c.snippets[1]).to eq text140[0, 60]
|
137
|
+
expect(c.snippets[2]).to eq text150[0, 60]
|
138
|
+
end
|
139
|
+
end
|
140
|
+
end
|
141
|
+
end
|
142
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
|
2
|
+
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
3
|
+
# Require this file using `require "spec_helper"` to ensure that it is only
|
4
|
+
# loaded once.
|
5
|
+
#
|
6
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
7
|
+
RSpec.configure do |config|
|
8
|
+
config.treat_symbols_as_metadata_keys_with_true_values = true
|
9
|
+
config.run_all_when_everything_filtered = true
|
10
|
+
config.filter_run :focus
|
11
|
+
|
12
|
+
# Run specs in random order to surface order dependencies. If you find an
|
13
|
+
# order dependency and want to debug it, you can fix the order by providing
|
14
|
+
# the seed, which is printed after each run.
|
15
|
+
# --seed 1234
|
16
|
+
config.order = 'random'
|
17
|
+
end
|
metadata
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: celes-web
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Andy Schrage
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-12-17 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: bundler
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '1.3'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '1.3'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rake
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: rspec
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: nokogiri
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
description: A simple Ruby gem for providing a snipet of text and images based on
|
79
|
+
a url.
|
80
|
+
email:
|
81
|
+
- ajschrag@mtu.edu
|
82
|
+
executables: []
|
83
|
+
extensions: []
|
84
|
+
extra_rdoc_files: []
|
85
|
+
files:
|
86
|
+
- .gitignore
|
87
|
+
- .rvmrc
|
88
|
+
- Gemfile
|
89
|
+
- Gemfile.lock
|
90
|
+
- README.md
|
91
|
+
- celes.gemspec
|
92
|
+
- lib/celes.rb
|
93
|
+
- spec/celes_spec.rb
|
94
|
+
- spec/spec_helper.rb
|
95
|
+
homepage: https://github.com/Swimminschrage/celes
|
96
|
+
licenses:
|
97
|
+
- MIT
|
98
|
+
post_install_message:
|
99
|
+
rdoc_options: []
|
100
|
+
require_paths:
|
101
|
+
- lib
|
102
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
103
|
+
none: false
|
104
|
+
requirements:
|
105
|
+
- - ! '>='
|
106
|
+
- !ruby/object:Gem::Version
|
107
|
+
version: '0'
|
108
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
109
|
+
none: false
|
110
|
+
requirements:
|
111
|
+
- - ! '>='
|
112
|
+
- !ruby/object:Gem::Version
|
113
|
+
version: '0'
|
114
|
+
requirements: []
|
115
|
+
rubyforge_project:
|
116
|
+
rubygems_version: 1.8.25
|
117
|
+
signing_key:
|
118
|
+
specification_version: 3
|
119
|
+
summary: A simple Ruby gem for providing a snipet of text and thumbnails based on
|
120
|
+
a url.
|
121
|
+
test_files:
|
122
|
+
- spec/celes_spec.rb
|
123
|
+
- spec/spec_helper.rb
|