exquisite_corpus 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 0746301260233dd47c8a66d37ce3eccb9808c12c
4
+ data.tar.gz: 158b58c581444b0d34d73bb2ea879adfc08d4ca6
5
+ SHA512:
6
+ metadata.gz: af53c05a6da730fd32c3ee2107a681cba2a5931f36934b767ab25c3590b16969d3db04006600e388c6b6c292f01d4bb9499d1367820b713f20c4a5290a6febd3
7
+ data.tar.gz: 1ac765bfe2d8d798743de4aa9d1bb048ed7c64c73fb5e35ac82d01a669eb50d34cbafa90b475092d74f8a1768c624d8310dff0c2e08354793800c6aac8d8e777
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --require spec_helper
data/.ruby-gemset ADDED
@@ -0,0 +1 @@
1
+ exquisite_corpus
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 2.3.1
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in exquisite_corpus.gemspec
4
+ gemspec
data/README.md ADDED
@@ -0,0 +1,67 @@
1
+ # Exquisite Corpus
2
+
3
+ Scrape sites, feeds, & files.
4
+ Turn them into 'just text' so you can feed them to the AI gawd.
5
+ **You can parse any type of feed supported by [feedjira](http://feedjira.com/).**
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'exquisite_corpus'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install exquisite_corpus
22
+
23
+ ## Usage
24
+
25
+
26
+ ### Basic
27
+
28
+ ```ruby
29
+
30
+ parser = ExquisiteCorpus.new(inputs: [
31
+ {source: 'http://example.com'},
32
+ {source: 'https://example.com/feed'},
33
+ {source: '/path/to/local/file.html'}
34
+ ])
35
+
36
+ parser.parse!
37
+
38
+ # A plain-text version of example.com's markup
39
+ parser.results.first.content
40
+
41
+ # Want all the text?
42
+ all_content = parser.results.map(&:content)
43
+
44
+ # Export a directory of text files
45
+ parser.export_to('/a/new/directory')
46
+ ```
47
+
48
+ ### Don't Scrape Dumb Stuff
49
+
50
+ This library **refuses to scrape** `script`, `form`, `input`, and `style` tags.
51
+ Want to exclude more? Pass an `except` array of CSS selectors:
52
+
53
+ ```ruby
54
+ parser = ExquisiteCorpus.new(inputs: [
55
+ {source: 'http://example.com', except: ['array', 'of', 'cssSelectors']}
56
+ ])
57
+ ```
58
+
59
+ ## Development
60
+
61
+ After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
62
+
63
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
64
+
65
+ ## Contributing
66
+
67
+ Bug reports and pull requests are welcome on GitHub at https://github.com/nodanaonlyzuul/exquisite_corpus.
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "exquisite_corpus"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,39 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'exquisite_corpus/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "exquisite_corpus"
8
+ spec.version = ExquisiteCorpus::VERSION
9
+ spec.authors = ["nodanaonlyzuul"]
10
+ spec.email = ["beholdthepanda@gmail.com"]
11
+
12
+ spec.summary = %q{Scrape sites, feeds, and files. Feed them to the AI gawd.}
13
+ spec.homepage = "https://github.com/nodanaonlyzuul/exquisite_corpus"
14
+
15
+ # # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
16
+ # # to allow pushing to a single host or delete this section to allow pushing to any host.
17
+ # if spec.respond_to?(:metadata)
18
+ # spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'"
19
+ # else
20
+ # raise "RubyGems 2.0 or newer is required to protect against " \
21
+ # "public gem pushes."
22
+ # end
23
+
24
+ spec.files = `git ls-files -z`.split("\x0").reject do |f|
25
+ f.match(%r{^(test|spec|features)/})
26
+ end
27
+ spec.bindir = "exe"
28
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
29
+ spec.require_paths = ["lib"]
30
+
31
+ spec.add_dependency 'feedjira', '~> 2.0'
32
+ spec.add_dependency 'marky_markov', '~> 0.3.5'
33
+ spec.add_development_dependency 'webmock', '~> 3.0', '>= 3.0.1'
34
+ spec.add_development_dependency 'rspec', '~> 3.6'
35
+ spec.add_development_dependency 'bundler', '~> 1.13'
36
+ spec.add_development_dependency 'rake', '~> 10.0'
37
+ spec.add_development_dependency 'pry', '~> 0.10.4'
38
+ spec.add_development_dependency 'vcr', '~> 3.0', '>= 3.0.3'
39
+ end
@@ -0,0 +1,89 @@
1
+ require 'rubygems'
2
+ require 'open-uri'
3
+ require 'feedjira'
4
+ require 'nokogiri'
5
+ require 'fileutils'
6
+
7
+ require File.join(__dir__, 'result')
8
+
9
# Reads a list of sources (URLs or local files), reduces each one to plain
# text, and collects the text as Result objects.
#
# Each input is a Hash with:
#   :source - URL or local file path to read (required)
#   :except - optional CSS selector, or array of selectors, removed from HTML
#             pages in addition to STRIPPED_TAGS
class ExquisiteCorpus
  attr_reader :results

  REQUIRED_ARGS = [:inputs].freeze
  # Elements that are always removed from HTML pages before text extraction.
  STRIPPED_TAGS = %w[script form input style].freeze

  # @param options [Hash] must contain :inputs, an array of input hashes
  # @raise [ArgumentError] when a required option is missing
  def initialize(options = {})
    check_for_required_args(options)
    @inputs  = options[:inputs]
    @results = []
  end

  # Reads every input and appends the extracted text to #results.
  # Each response is tried as a feed first and as an HTML page second.
  # Errors raised while *reading* a source are not rescued here.
  def parse!
    @inputs.each do |input|
      response = read_source(input[:source])
      parse_feed(input, response) || parse_html(input, response)
    end
  end

  # Writes one text file per result into +path+, creating the directory when
  # needed. Does nothing (and returns nil) when there are no results.
  #
  # @param path [String] destination directory
  def export_to(path)
    return if @results.empty?

    path = File.expand_path(path)
    FileUtils.mkdir_p(path)
    @results.each do |result|
      File.write(File.join(path, result.exported_as), result.content)
    end
  end

  private

  # Returns the raw body of +source+.
  #
  # Anything that looks like "scheme://..." and parses to a URI that open-uri
  # can open is fetched through open-uri; everything else is read as a local
  # file. Kernel#open is deliberately avoided: it would execute a source such
  # as "| some command" as a subprocess, and it never closed the handle.
  def read_source(source)
    source = source.to_s
    if source =~ %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}
      uri = URI.parse(source)
      return uri.open(&:read) if uri.respond_to?(:open)
    end
    File.open(source, 'r', &:read)
  end

  # Tries to treat +response+ as a feed and adds one Result per entry.
  #
  # Results are built first and appended in one step, so a failure part-way
  # through the entries does not leave partial results behind before the
  # HTML fallback runs.
  #
  # @return [true, false] false when anything raised a StandardError
  #   (presumably including Feedjira rejecting non-feed input)
  def parse_feed(input, response)
    feed = Feedjira::Feed.parse(response)
    entries = feed.entries.map do |entry|
      body = Nokogiri::HTML(entry.content).css('body')
      Result.new(source: input[:source], content: body.text)
    end
    @results.concat(entries)
    true
  rescue StandardError
    false
  end

  # Treats +response+ as an HTML page: removes unwanted elements from the
  # body and adds its text as a single Result.
  #
  # @return [Array, false] the results array, or false when a StandardError
  #   was raised
  def parse_html(input, response)
    body = Nokogiri::HTML(response).css('body')
    strip_tags!(body, input)
    @results << Result.new(source: input[:source], content: body.text)
  rescue StandardError
    false
  end

  # ArgumentError is the parent class of the UncaughtThrowError that the
  # previous `throw` produced, so callers rescuing ArgumentError still work.
  def check_for_required_args(args)
    REQUIRED_ARGS.each do |required_arg|
      raise ArgumentError, "#{required_arg} is required" unless args[required_arg]
    end
  end

  # Removes STRIPPED_TAGS plus the input's own :except selectors from
  # +document+. A new selector list is built on every call so one input's
  # :except values never leak into the next input.
  def strip_tags!(document, input)
    selectors = (STRIPPED_TAGS + Array(input[:except])).flatten.compact
    strip_except_options(document, selectors)
  end

  # Removes every node matching any of +css_selectors+ from +document+.
  def strip_except_options(document, css_selectors)
    css_selectors.each do |css_selector|
      document.search(css_selector).remove
    end
  end
end
@@ -0,0 +1,3 @@
1
class ExquisiteCorpus
  # Gem version string; frozen so it cannot be modified in place at runtime.
  VERSION = "0.1.0".freeze
end
data/lib/result.rb ADDED
@@ -0,0 +1,44 @@
1
+ require 'uri'
2
+ require 'digest'
3
+
4
# One extracted document: the source it came from and its plain-text content.
class Result
  attr_reader :source, :content

  # @param options [Hash] :source (URL or file path) and :content (text)
  def initialize(options = {})
    @source  = options[:source]
    @content = options[:content]
  end

  # Builds a file name for exporting this result.
  #
  # http/https sources become "<host>.<last path segment>", other sources use
  # the file's basename without extension. A random numeric suffix and ".txt"
  # are appended, so the name differs between calls.
  #
  # @return [String, nil] nil when the result has no source
  def exported_as
    return unless @source

    file_basename =
      if source_is_url?
        output_from_url(@source)
      else
        File.basename(@source, '.*')
      end

    "#{file_basename}-#{rand(900000)}.txt"
  end

  private

  # Derives the extension-less base name for a URL source.
  #
  # The ".txt" suffix is added once by #exported_as (it used to be added here
  # as well, producing "name.txt-123.txt"). A path that is empty or just "/"
  # contributes nothing, so the name never contains a path separator.
  def output_from_url(source)
    uri  = URI.parse(source)
    leaf = File.basename(uri.path.to_s, '.*')
    return uri.host.to_s if leaf.empty? || leaf == '/'

    "#{uri.host}.#{leaf}"
  end

  # True when the source parses as an http or https URL. Sources that are not
  # valid URIs at all (e.g. local paths containing spaces) count as non-URLs
  # instead of raising.
  def source_is_url?
    %w[http https].include?(URI.parse(@source.to_s).scheme)
  rescue URI::InvalidURIError
    false
  end
end
metadata ADDED
@@ -0,0 +1,180 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: exquisite_corpus
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - nodanaonlyzuul
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2017-06-26 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: feedjira
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: marky_markov
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 0.3.5
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 0.3.5
41
+ - !ruby/object:Gem::Dependency
42
+ name: webmock
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ - - ">="
49
+ - !ruby/object:Gem::Version
50
+ version: 3.0.1
51
+ type: :development
52
+ prerelease: false
53
+ version_requirements: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - "~>"
56
+ - !ruby/object:Gem::Version
57
+ version: '3.0'
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: 3.0.1
61
+ - !ruby/object:Gem::Dependency
62
+ name: rspec
63
+ requirement: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '3.6'
68
+ type: :development
69
+ prerelease: false
70
+ version_requirements: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: '3.6'
75
+ - !ruby/object:Gem::Dependency
76
+ name: bundler
77
+ requirement: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '1.13'
82
+ type: :development
83
+ prerelease: false
84
+ version_requirements: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - "~>"
87
+ - !ruby/object:Gem::Version
88
+ version: '1.13'
89
+ - !ruby/object:Gem::Dependency
90
+ name: rake
91
+ requirement: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - "~>"
94
+ - !ruby/object:Gem::Version
95
+ version: '10.0'
96
+ type: :development
97
+ prerelease: false
98
+ version_requirements: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - "~>"
101
+ - !ruby/object:Gem::Version
102
+ version: '10.0'
103
+ - !ruby/object:Gem::Dependency
104
+ name: pry
105
+ requirement: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - "~>"
108
+ - !ruby/object:Gem::Version
109
+ version: 0.10.4
110
+ type: :development
111
+ prerelease: false
112
+ version_requirements: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - "~>"
115
+ - !ruby/object:Gem::Version
116
+ version: 0.10.4
117
+ - !ruby/object:Gem::Dependency
118
+ name: vcr
119
+ requirement: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - "~>"
122
+ - !ruby/object:Gem::Version
123
+ version: '3.0'
124
+ - - ">="
125
+ - !ruby/object:Gem::Version
126
+ version: 3.0.3
127
+ type: :development
128
+ prerelease: false
129
+ version_requirements: !ruby/object:Gem::Requirement
130
+ requirements:
131
+ - - "~>"
132
+ - !ruby/object:Gem::Version
133
+ version: '3.0'
134
+ - - ">="
135
+ - !ruby/object:Gem::Version
136
+ version: 3.0.3
137
+ description:
138
+ email:
139
+ - beholdthepanda@gmail.com
140
+ executables: []
141
+ extensions: []
142
+ extra_rdoc_files: []
143
+ files:
144
+ - ".gitignore"
145
+ - ".rspec"
146
+ - ".ruby-gemset"
147
+ - ".ruby-version"
148
+ - Gemfile
149
+ - README.md
150
+ - Rakefile
151
+ - bin/console
152
+ - bin/setup
153
+ - exquisite_corpus.gemspec
154
+ - lib/exquisite_corpus.rb
155
+ - lib/exquisite_corpus/version.rb
156
+ - lib/result.rb
157
+ homepage: https://github.com/nodanaonlyzuul/exquisite_corpus
158
+ licenses: []
159
+ metadata: {}
160
+ post_install_message:
161
+ rdoc_options: []
162
+ require_paths:
163
+ - lib
164
+ required_ruby_version: !ruby/object:Gem::Requirement
165
+ requirements:
166
+ - - ">="
167
+ - !ruby/object:Gem::Version
168
+ version: '0'
169
+ required_rubygems_version: !ruby/object:Gem::Requirement
170
+ requirements:
171
+ - - ">="
172
+ - !ruby/object:Gem::Version
173
+ version: '0'
174
+ requirements: []
175
+ rubyforge_project:
176
+ rubygems_version: 2.5.1
177
+ signing_key:
178
+ specification_version: 4
179
+ summary: Scrape sites, feeds, and files. Feed them to the AI gawd.
180
+ test_files: []