textract 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 052757f993bc3c87948e1d650ea837a48fef8c73
4
+ data.tar.gz: a52c485fb6af0fcafcc1e9f743c617223e181fc9
5
+ SHA512:
6
+ metadata.gz: 39c8b5162e22ca62fa182791324ae2e828712cde3d0e5a131212d4af105a40bbfe07ca6808b337900e4bc3bd492e1b210f8e0d0653035ab8f7c56d7b4e226172
7
+ data.tar.gz: 74c00abd7a1d54dd0e8bf1153aceb9b98751b070c2ced92b8257d79a7836a292ee7ba646a984e1d3aaae0b35d20c874821d6fdaf37f33ac3c019e253f77c77de
data/.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.bundle
11
+ *.so
12
+ *.o
13
+ *.a
14
+ mkmf.log
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in textract.gemspec
4
+ gemspec
data/Guardfile ADDED
@@ -0,0 +1,77 @@
1
+ # A sample Guardfile
2
+ # More info at https://github.com/guard/guard#readme
3
+
4
+ ## Uncomment and set this to only include directories you want to watch
5
+ # directories %w(app lib config test spec features)
6
+
7
+ ## Uncomment to clear the screen before every task
8
+ # clearing :on
9
+
10
+ ## Guard internally checks for changes in the Guardfile and exits.
11
+ ## If you want Guard to automatically start up again, run guard in a
12
+ ## shell loop, e.g.:
13
+ ##
14
+ ## $ while bundle exec guard; do echo "Restarting Guard..."; done
15
+ ##
16
+ ## Note: if you are using the `directories` clause above and you are not
17
+ ## watching the project directory ('.'), then you will want to move
18
+ ## the Guardfile to a watched dir and symlink it back, e.g.
19
+ #
20
+ # $ mkdir config
21
+ # $ mv Guardfile config/
22
+ # $ ln -s config/Guardfile .
23
+ #
24
+ # and, you'll have to watch "config/Guardfile" instead of "Guardfile"
25
+
26
+ # Note: The cmd option is now required due to the increasing number of ways
27
+ # rspec may be run, below are examples of the most common uses.
28
+ # * bundler: 'bundle exec rspec'
29
+ # * bundler binstubs: 'bin/rspec'
30
+ # * spring: 'bin/rspec' (This will use spring if running and you have
31
+ # installed the spring binstubs per the docs)
32
+ # * zeus: 'zeus rspec' (requires the server to be started separately)
33
+ # * 'just' rspec: 'rspec'
34
+
35
+ guard :rspec, cmd: "bundle exec rspec" do
36
+ require "guard/rspec/dsl"
37
+ dsl = Guard::RSpec::Dsl.new(self)
38
+
39
+ # Feel free to open issues for suggestions and improvements
40
+
41
+ # RSpec files
42
+ rspec = dsl.rspec
43
+ watch(rspec.spec_helper) { rspec.spec_dir }
44
+ watch(rspec.spec_support) { rspec.spec_dir }
45
+ watch(rspec.spec_files)
46
+
47
+ # Ruby files
48
+ ruby = dsl.ruby
49
+ dsl.watch_spec_files_for(ruby.lib_files)
50
+
51
+ # Rails files
52
+ rails = dsl.rails(view_extensions: %w(erb haml slim))
53
+ dsl.watch_spec_files_for(rails.app_files)
54
+ dsl.watch_spec_files_for(rails.views)
55
+
56
+ watch(rails.controllers) do |m|
57
+ [
58
+ rspec.spec.("routing/#{m[1]}_routing"),
59
+ rspec.spec.("controllers/#{m[1]}_controller"),
60
+ rspec.spec.("acceptance/#{m[1]}")
61
+ ]
62
+ end
63
+
64
+ # Rails config changes
65
+ watch(rails.spec_helper) { rspec.spec_dir }
66
+ watch(rails.routes) { "#{rspec.spec_dir}/routing" }
67
+ watch(rails.app_controller) { "#{rspec.spec_dir}/controllers" }
68
+
69
+ # Capybara features specs
70
+ watch(rails.view_dirs) { |m| rspec.spec.("features/#{m[1]}") }
71
+
72
+ # Turnip features and steps
73
+ watch(%r{^spec/acceptance/(.+)\.feature$})
74
+ watch(%r{^spec/acceptance/steps/(.+)_steps\.rb$}) do |m|
75
+ Dir[File.join("**/#{m[1]}.feature")][0] || "spec/acceptance"
76
+ end
77
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2015 Adam Pash
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,31 @@
1
+ # Textract
2
+
3
+ TODO: Write a gem description
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'textract'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install textract
20
+
21
+ ## Usage
22
+
23
+ TODO: Write usage instructions here
24
+
25
+ ## Contributing
26
+
27
+ 1. Fork it ( https://github.com/[my-github-username]/textract/fork )
28
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
29
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
30
+ 4. Push to the branch (`git push origin my-new-feature`)
31
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
data/lib/textract.rb ADDED
@@ -0,0 +1,81 @@
1
+ require "textract/version"
2
+ require 'httparty'
3
+ require 'nokogiri'
4
+ require 'opengraph_parser'
5
+ require 'reverse_markdown'
6
+ require 'readability'
7
+
8
+ module Textract
9
+ # attr_accessor :client
10
+
11
+ def self.get_text(url, selectors=nil)
12
+ @client = Client.new(url, selectors)
13
+ end
14
+
15
+ def self.get_og_tags(html)
16
+ OpenGraph.new(html)
17
+ end
18
+
19
+ def self.get_text_from_description(html, description, selectors)
20
+ doc = Nokogiri::HTML html
21
+ if selectors.nil?
22
+ article = doc.search('article')
23
+ else
24
+ article = doc.search(selectors)
25
+ end
26
+ if article.count == 1
27
+ article_el = article[0]
28
+ elsif !description.nil? and article.count == 0
29
+ els = [1,2,3]
30
+ i = 1
31
+ until els.count < 2
32
+ search_text = description.split(" ")[0..i].join(" ")
33
+ puts search_text
34
+ els = doc.search "[text()*='#{search_text}']"
35
+ i += 1
36
+ end
37
+ if els.count == 1
38
+ el = els[0]
39
+ article_el = el.parent
40
+ else
41
+ # do something else if multiple or no matches
42
+ end
43
+ else
44
+ article_el = doc
45
+ end
46
+ article = Readability::Document.new(article_el.to_s,
47
+ tags: %w[div span p a img ul ol li blockquote table tr td h1 h2 h3 h4 h5 b em i strong],
48
+ attributes: %w[src href],
49
+ remove_empty_nodes: true,
50
+ ).content
51
+ markdown = ReverseMarkdown.convert article, unknown_tags: :bypass
52
+ # TODO change to drop once article is supported by reversemarkdown
53
+ markdown
54
+ end
55
+
56
+ def self.get_page_title(html)
57
+ Nokogiri::HTML(html).search('title').text
58
+ end
59
+
60
+ class Client
61
+ attr_reader :html
62
+ attr_reader :url
63
+ attr_reader :tags
64
+ attr_reader :title
65
+ attr_reader :text
66
+
67
+ def initialize(url, selectors)
68
+ @url = url
69
+ @html = HTTParty.get url
70
+ @tags = Textract.get_og_tags(@html)
71
+ if @tags.nil? or @tags.description.nil?
72
+ # use readability method
73
+ @text = Textract.get_text_from_description(@html, nil, selectors)
74
+ @title = Textract.get_page_title(@html)
75
+ else
76
+ @text = Textract.get_text_from_description(@html, @tags.description, selectors)
77
+ @title = @tags.title
78
+ end
79
+ end
80
+ end
81
+ end
@@ -0,0 +1,3 @@
1
+ module Textract
2
+ VERSION = "0.0.1"
3
+ end
@@ -0,0 +1,33 @@
1
+ require_relative '../../lib/textract'
2
+ describe Textract do
3
+ it "initializes with the get_text method" do
4
+ url = "http://www.tedcruz.org/about/"
5
+ textract = Textract.get_text(url)
6
+ expect(textract).to be_a_kind_of Textract::Client
7
+ end
8
+
9
+ it "returns article text based on article tag" do
10
+ url = "http://gawker.com/1694508525"
11
+ textract = Textract.get_text(url)
12
+ expect(textract.text[0..5]).to eq "Import"
13
+ end
14
+
15
+ it "returns article text based on opengraph description" do
16
+ url = "http://www.tedcruz.org/record/our-standard-the-constitution/"
17
+ textract = Textract.get_text(url)
18
+ expect(textract.text[0..5]).to eq "Ted Cr"
19
+ end
20
+
21
+ it "can find a twitter profile given a selector" do
22
+ url = "https://twitter.com/lifehacker"
23
+ textract = Textract.get_text(url, 'p.ProfileHeaderCard-bio.u-dir')
24
+ expect(textract.text.strip).to eq "Don't live to geek; geek to live."
25
+ expect(textract.title).to eq "Lifehacker (@lifehacker) | Twitter"
26
+ end
27
+
28
+ it "gets the page title from the title tag" do
29
+ html = "<html><head><title>Stuff</title></head><body><h1>FOO!</h1></body></html>"
30
+ expect(Textract.get_page_title(html)).to eq "Stuff"
31
+ end
32
+
33
+ end
data/textract.gemspec ADDED
@@ -0,0 +1,30 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'textract/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "textract"
8
+ spec.version = Textract::VERSION
9
+ spec.authors = ["Adam Pash"]
10
+ spec.email = ["adam.pash@gmail.com"]
11
+ spec.summary = %q{Extracts article text from a URL}
12
+ spec.description = %q{Extracts article text from a URL}
13
+ spec.homepage = ""
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files -z`.split("\x0")
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_dependency "opengraph_parser"
22
+ spec.add_dependency "httparty"
23
+ spec.add_dependency "reverse_markdown"
24
+ spec.add_dependency "ruby-readability"
25
+
26
+ spec.add_development_dependency "bundler", "~> 1.7"
27
+ spec.add_development_dependency "rake", "~> 10.0"
28
+ spec.add_development_dependency "rspec"
29
+ spec.add_development_dependency "guard-rspec"
30
+ end
metadata ADDED
@@ -0,0 +1,167 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: textract
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Adam Pash
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-03-30 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: opengraph_parser
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: httparty
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: reverse_markdown
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: ruby-readability
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: bundler
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.7'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.7'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '10.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '10.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rspec
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: guard-rspec
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ description: Extracts article text from a URL
126
+ email:
127
+ - adam.pash@gmail.com
128
+ executables: []
129
+ extensions: []
130
+ extra_rdoc_files: []
131
+ files:
132
+ - ".gitignore"
133
+ - Gemfile
134
+ - Guardfile
135
+ - LICENSE.txt
136
+ - README.md
137
+ - Rakefile
138
+ - lib/textract.rb
139
+ - lib/textract/version.rb
140
+ - spec/lib/textract_spec.rb
141
+ - textract.gemspec
142
+ homepage: ''
143
+ licenses:
144
+ - MIT
145
+ metadata: {}
146
+ post_install_message:
147
+ rdoc_options: []
148
+ require_paths:
149
+ - lib
150
+ required_ruby_version: !ruby/object:Gem::Requirement
151
+ requirements:
152
+ - - ">="
153
+ - !ruby/object:Gem::Version
154
+ version: '0'
155
+ required_rubygems_version: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - ">="
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
160
+ requirements: []
161
+ rubyforge_project:
162
+ rubygems_version: 2.4.5
163
+ signing_key:
164
+ specification_version: 4
165
+ summary: Extracts article text from a URL
166
+ test_files:
167
+ - spec/lib/textract_spec.rb