exquisite_corpus 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 0746301260233dd47c8a66d37ce3eccb9808c12c
4
+ data.tar.gz: 158b58c581444b0d34d73bb2ea879adfc08d4ca6
5
+ SHA512:
6
+ metadata.gz: af53c05a6da730fd32c3ee2107a681cba2a5931f36934b767ab25c3590b16969d3db04006600e388c6b6c292f01d4bb9499d1367820b713f20c4a5290a6febd3
7
+ data.tar.gz: 1ac765bfe2d8d798743de4aa9d1bb048ed7c64c73fb5e35ac82d01a669eb50d34cbafa90b475092d74f8a1768c624d8310dff0c2e08354793800c6aac8d8e777
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --require spec_helper
data/.ruby-gemset ADDED
@@ -0,0 +1 @@
1
+ exquisite_corpus
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 2.3.1
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in exquisite_corpus.gemspec
4
+ gemspec
data/README.md ADDED
@@ -0,0 +1,67 @@
1
+ # Exquisite Corpus
2
+
3
+ Scrape sites, feeds, & files.
4
+ Turn them to 'just text' so you can feed them to the AI gawd.
5
+ **You can parse any type of feed supported by [feedjira](http://feedjira.com/).**
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'exquisite_corpus'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install exquisite_corpus
22
+
23
+ ## Usage
24
+
25
+
26
+ ### Basic
27
+
28
+ ```ruby
29
+
30
+ parser = ExquisiteCorpus.new(inputs: [
31
+ {source: 'http://example.com'},
32
+ {source: 'https://example.com/feed'},
33
+ {source: '/path/to/local/file.html'}
34
+ ])
35
+
36
+ parser.parse!
37
+
38
+ # The plain text extracted from example.com's markup
39
+ parser.results.first.content
40
+
41
+ # Want all the text?
42
+ all_content = parser.results.map(&:content)
43
+
44
+ # Export a directory of text files
45
+ parser.export_to('/a/new/directory')
46
+ ```
47
+
48
+ ### Don't Scrape Dumb Stuff
49
+
50
+ This library **refuses to scrape** `script`, `form`, `input`, and `style` tags.
51
+ Want to exclude more? Pass an `except` array of CSS selectors for that source:
52
+
53
+ ```ruby
54
+ parser = ExquisiteCorpus.new(inputs: [
55
+ {source: 'http://example.com', except: ['array', 'of', 'cssSelectors']}
56
+ ])
57
+ ```
58
+
59
+ ## Development
60
+
61
+ After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
62
+
63
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
64
+
65
+ ## Contributing
66
+
67
+ Bug reports and pull requests are welcome on GitHub at https://github.com/nodanaonlyzuul/exquisite_corpus.
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
# Rake entry point: Bundler's gem tasks (build/install/release) plus the
# RSpec suite as the default task.
require "bundler/gem_tasks"
require "rspec/core/rake_task"

# Defines the `spec` task that the default task depends on; without it a
# bare `rake` aborts because the prerequisite does not exist.
RSpec::Core::RakeTask.new(:spec)

task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "exquisite_corpus"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,39 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'exquisite_corpus/version'
5
+
6
# Gem metadata, packaged file list and dependency declarations.
Gem::Specification.new do |spec|
  spec.name          = "exquisite_corpus"
  spec.version       = ExquisiteCorpus::VERSION
  spec.authors       = ["nodanaonlyzuul"]
  spec.email         = ["beholdthepanda@gmail.com"]

  spec.summary       = %q{Scrape sites, feeds, and files. Feed them to the AI gawd.}
  spec.homepage      = "https://github.com/nodanaonlyzuul/exquisite_corpus"

  # # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
  # # to allow pushing to a single host or delete this section to allow pushing to any host.
  # if spec.respond_to?(:metadata)
  #   spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'"
  # else
  #   raise "RubyGems 2.0 or newer is required to protect against " \
  #     "public gem pushes."
  # end

  # Package every git-tracked file except the test suites.
  spec.files         = `git ls-files -z`.split("\x0").reject do |f|
    f.match(%r{^(test|spec|features)/})
  end
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_dependency 'feedjira', '~> 2.0'
  # lib/exquisite_corpus.rb requires nokogiri directly, so declare it rather
  # than relying on it arriving through another gem's dependencies.
  spec.add_dependency 'nokogiri', '~> 1.6'
  # NOTE(review): nothing under lib/ appears to require marky_markov -- confirm
  # whether this runtime dependency is still needed.
  spec.add_dependency 'marky_markov', '~> 0.3.5'
  spec.add_development_dependency 'webmock', '~> 3.0', '>= 3.0.1'
  spec.add_development_dependency 'rspec', '~> 3.6'
  spec.add_development_dependency 'bundler', '~> 1.13'
  spec.add_development_dependency 'rake', '~> 10.0'
  spec.add_development_dependency 'pry', '~> 0.10.4'
  spec.add_development_dependency 'vcr', '~> 3.0', '>= 3.0.3'
end
@@ -0,0 +1,89 @@
1
+ require 'rubygems'
2
+ require 'open-uri'
3
+ require 'feedjira'
4
+ require 'nokogiri'
5
+ require 'fileutils'
6
+
7
+ require File.join(__dir__, 'result')
8
+
9
# Fetches a list of sources (web pages, feeds, or local files) and reduces
# each of them to plain text, collected as Result objects in #results.
class ExquisiteCorpus
  # @return [Array<Result>] everything extracted by #parse! so far
  attr_reader :results

  # Option keys that must be present (and truthy) when constructing.
  REQUIRED_ARGS = [:inputs].freeze
  # Elements always removed from an HTML document before its text is taken.
  STRIPPED_TAGS = %w[script form input style].freeze

  # @param options [Hash] must contain :inputs, an Array of Hashes. Each
  #   input carries a :source (URL or file path) and may carry :except
  #   (extra CSS selectors to strip from HTML documents).
  # @raise [ArgumentError] when a required option is missing
  def initialize(options = {})
    check_for_required_args(options)
    @inputs  = options[:inputs]
    @results = []
  end

  # Reads every input and appends the extracted text to #results. Each
  # source is first tried as a feed and, failing that, treated as HTML.
  def parse!
    @inputs.each do |input|
      response = read_source(input[:source])
      parse_feed(input, response) || parse_html(input, response)
    end
  end

  # Writes one text file per result into +path+, creating the directory if
  # needed. Does nothing when there are no results.
  #
  # @param path [String] target directory
  def export_to(path)
    return if @results.empty?

    directory = File.expand_path(path)
    FileUtils.mkdir_p(directory)
    @results.each do |result|
      File.write(File.join(directory, result.exported_as), result.content)
    end
  end

  private

  # Returns the raw body of +source+.
  #
  # Kernel#open is deliberately avoided: it runs a command for sources that
  # start with "|", it leaves the file handle open, and on Ruby 3 it no
  # longer opens URLs. URIs that open-uri can fetch are read through it;
  # everything else is read as a local file.
  def read_source(source)
    uri = begin
      URI.parse(source.to_s)
    rescue URI::InvalidURIError
      nil # e.g. a local path containing spaces
    end

    if uri.is_a?(OpenURI::OpenRead)
      uri.read
    else
      File.read(source)
    end
  end

  # Tries to interpret +response+ as a feed, adding one result per entry.
  #
  # Results are gathered locally and appended only when the whole feed was
  # processed, so a failure never leaves partial feed results behind before
  # the HTML fallback runs.
  #
  # @return [Boolean] true when handled as a feed, false otherwise
  def parse_feed(input, response)
    entries = Feedjira::Feed.parse(response).entries.map do |entry|
      # entry.content may be nil for some entries; treat that as empty markup.
      body = Nokogiri::HTML(entry.content.to_s).css('body')
      Result.new(source: input[:source], content: body.text)
    end
    @results.concat(entries)
    true
  rescue StandardError
    # Best effort: anything that does not parse as a feed is retried as HTML.
    false
  end

  # Extracts the text of the document body, minus the stripped selectors,
  # and adds it as a single result.
  #
  # @return [Array<Result>, false] false when the document could not be handled
  def parse_html(input, response)
    body = Nokogiri::HTML(response).css('body')
    strip_tags!(body, input)

    @results << Result.new(source: input[:source], content: body.text)
  rescue StandardError => e
    warn "ExquisiteCorpus: could not parse #{input[:source]} as HTML (#{e.class}: #{e.message})"
    false
  end

  # @raise [ArgumentError] naming the first required option that is missing
  def check_for_required_args(args)
    REQUIRED_ARGS.each do |required_arg|
      raise ArgumentError, "#{required_arg} is required" unless args[required_arg]
    end
  end

  # Removes the default tags plus the input's own :except selectors.
  # A fresh array is built on every call so the selectors of one input never
  # leak into STRIPPED_TAGS and, through it, into later inputs.
  def strip_tags!(document, input)
    selectors = STRIPPED_TAGS + Array(input[:except]).flatten.compact
    strip_except_options(document, selectors)
  end

  # Deletes every node matching any of +css_selectors+ from +document+.
  def strip_except_options(document, css_selectors)
    css_selectors.each do |css_selector|
      document.search(css_selector).remove
    end
  end
end
@@ -0,0 +1,3 @@
1
class ExquisiteCorpus
  # Release number of the gem; the gemspec reads it as spec.version.
  # Frozen so it cannot be altered in place at runtime.
  VERSION = "0.1.0".freeze
end
data/lib/result.rb ADDED
@@ -0,0 +1,44 @@
1
+ require 'uri'
2
+ require 'digest'
3
+
4
# One extracted document: the source it came from and its plain-text content.
class Result
  attr_reader :source, :content

  # @param options [Hash] :source (URL or file path) and :content (text)
  def initialize(options = {})
    @source  = options[:source]
    @content = options[:content]
  end

  # Builds the file name this result is exported under: a stem derived from
  # the source, a random numeric suffix, and a single ".txt" extension.
  #
  # @return [String, nil] nil when the result has no source
  def exported_as
    return unless @source

    file_basename = source_is_url? ? output_from_url(@source) : File.basename(@source, '.*')
    "#{file_basename}-#{rand(900000)}.txt"
  end

  private

  # Derives a stem such as "example.com.feed" from an http(s) URL.
  # No extension is added here (the caller appends ".txt"), and an empty or
  # root ("/") path contributes nothing, so the stem never contains a slash.
  def output_from_url(source)
    uri  = URI.parse(source)
    leaf = File.basename(uri.path.to_s, '.*')

    [uri.host, leaf].reject { |part| part.nil? || part.empty? || part == '/' }.join('.')
  end

  # True when the source is an http or https URL. Strings that are not
  # valid URIs (for example local paths containing spaces) are not URLs.
  def source_is_url?
    %w[http https].include?(URI.parse(@source.to_s).scheme)
  rescue URI::InvalidURIError
    false
  end
end
metadata ADDED
@@ -0,0 +1,180 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: exquisite_corpus
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - nodanaonlyzuul
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2017-06-26 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: feedjira
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: marky_markov
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 0.3.5
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 0.3.5
41
+ - !ruby/object:Gem::Dependency
42
+ name: webmock
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ - - ">="
49
+ - !ruby/object:Gem::Version
50
+ version: 3.0.1
51
+ type: :development
52
+ prerelease: false
53
+ version_requirements: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - "~>"
56
+ - !ruby/object:Gem::Version
57
+ version: '3.0'
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: 3.0.1
61
+ - !ruby/object:Gem::Dependency
62
+ name: rspec
63
+ requirement: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '3.6'
68
+ type: :development
69
+ prerelease: false
70
+ version_requirements: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: '3.6'
75
+ - !ruby/object:Gem::Dependency
76
+ name: bundler
77
+ requirement: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '1.13'
82
+ type: :development
83
+ prerelease: false
84
+ version_requirements: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - "~>"
87
+ - !ruby/object:Gem::Version
88
+ version: '1.13'
89
+ - !ruby/object:Gem::Dependency
90
+ name: rake
91
+ requirement: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - "~>"
94
+ - !ruby/object:Gem::Version
95
+ version: '10.0'
96
+ type: :development
97
+ prerelease: false
98
+ version_requirements: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - "~>"
101
+ - !ruby/object:Gem::Version
102
+ version: '10.0'
103
+ - !ruby/object:Gem::Dependency
104
+ name: pry
105
+ requirement: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - "~>"
108
+ - !ruby/object:Gem::Version
109
+ version: 0.10.4
110
+ type: :development
111
+ prerelease: false
112
+ version_requirements: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - "~>"
115
+ - !ruby/object:Gem::Version
116
+ version: 0.10.4
117
+ - !ruby/object:Gem::Dependency
118
+ name: vcr
119
+ requirement: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - "~>"
122
+ - !ruby/object:Gem::Version
123
+ version: '3.0'
124
+ - - ">="
125
+ - !ruby/object:Gem::Version
126
+ version: 3.0.3
127
+ type: :development
128
+ prerelease: false
129
+ version_requirements: !ruby/object:Gem::Requirement
130
+ requirements:
131
+ - - "~>"
132
+ - !ruby/object:Gem::Version
133
+ version: '3.0'
134
+ - - ">="
135
+ - !ruby/object:Gem::Version
136
+ version: 3.0.3
137
+ description:
138
+ email:
139
+ - beholdthepanda@gmail.com
140
+ executables: []
141
+ extensions: []
142
+ extra_rdoc_files: []
143
+ files:
144
+ - ".gitignore"
145
+ - ".rspec"
146
+ - ".ruby-gemset"
147
+ - ".ruby-version"
148
+ - Gemfile
149
+ - README.md
150
+ - Rakefile
151
+ - bin/console
152
+ - bin/setup
153
+ - exquisite_corpus.gemspec
154
+ - lib/exquisite_corpus.rb
155
+ - lib/exquisite_corpus/version.rb
156
+ - lib/result.rb
157
+ homepage: https://github.com/nodanaonlyzuul/exquisite_corpus
158
+ licenses: []
159
+ metadata: {}
160
+ post_install_message:
161
+ rdoc_options: []
162
+ require_paths:
163
+ - lib
164
+ required_ruby_version: !ruby/object:Gem::Requirement
165
+ requirements:
166
+ - - ">="
167
+ - !ruby/object:Gem::Version
168
+ version: '0'
169
+ required_rubygems_version: !ruby/object:Gem::Requirement
170
+ requirements:
171
+ - - ">="
172
+ - !ruby/object:Gem::Version
173
+ version: '0'
174
+ requirements: []
175
+ rubyforge_project:
176
+ rubygems_version: 2.5.1
177
+ signing_key:
178
+ specification_version: 4
179
+ summary: Scrape sites, feeds, and files. Feed them to the AI gawd.
180
+ test_files: []