news_scraper 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.rubocop.yml +96 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +21 -0
- data/README.md +105 -0
- data/Rakefile +24 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/circle.yml +3 -0
- data/config/article_scrape_patterns.yml +116 -0
- data/config/temp_dirs.yml +4 -0
- data/dev.yml +13 -0
- data/lib/news_scraper/active_support_lite/string.rb +11 -0
- data/lib/news_scraper/cli.rb +106 -0
- data/lib/news_scraper/constants.rb +6 -0
- data/lib/news_scraper/errors.rb +16 -0
- data/lib/news_scraper/extractors/article.rb +17 -0
- data/lib/news_scraper/extractors/google_news_rss.rb +41 -0
- data/lib/news_scraper/extractors_helpers.rb +27 -0
- data/lib/news_scraper/scraper.rb +42 -0
- data/lib/news_scraper/trainer/preset_selector.rb +77 -0
- data/lib/news_scraper/trainer/url_trainer.rb +74 -0
- data/lib/news_scraper/trainer.rb +25 -0
- data/lib/news_scraper/transformers/article.rb +77 -0
- data/lib/news_scraper/transformers/trainer_article.rb +17 -0
- data/lib/news_scraper/uri_parser.rb +41 -0
- data/lib/news_scraper/version.rb +3 -0
- data/lib/news_scraper.rb +42 -0
- data/news_scraper.gemspec +41 -0
- metadata +337 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: cadeecdf95b6d5fd907671773fc3cc521fe756a2
|
4
|
+
data.tar.gz: 150d667251c00f96b01b195c99fe2a44f3da8da9
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 8af8f251dce23589d5e08af5f9c0510d031b91ed56188a0e6b3418b26397ad9ba98835a516c8e174d8aed3ea791793e5ceec8c3422a6e7731ac84196b49f287b
|
7
|
+
data.tar.gz: d55a513397be97f08b0adcbc975db4ad1c39d0887576979b138a50b5cb3601eb03036a9cfe75d1b0d4dcd2ab0303256c9112c865749905e5540f35bdefef18eb
|
data/.gitignore
ADDED
data/.rubocop.yml
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
AllCops:
|
2
|
+
TargetRubyVersion: 2.3
|
3
|
+
|
4
|
+
ClassLength:
|
5
|
+
Max: 500
|
6
|
+
|
7
|
+
ModuleLength:
|
8
|
+
Max: 500
|
9
|
+
|
10
|
+
Rails:
|
11
|
+
Enabled: false
|
12
|
+
|
13
|
+
Lint/AssignmentInCondition:
|
14
|
+
Enabled: false
|
15
|
+
|
16
|
+
Style/Documentation:
|
17
|
+
Enabled: false
|
18
|
+
|
19
|
+
Style/MultilineOperationIndentation:
|
20
|
+
Enabled: true
|
21
|
+
|
22
|
+
Style/AlignParameters:
|
23
|
+
EnforcedStyle: with_fixed_indentation
|
24
|
+
|
25
|
+
Style/FirstParameterIndentation:
|
26
|
+
EnforcedStyle: consistent
|
27
|
+
|
28
|
+
Style/TrailingCommaInLiteral:
|
29
|
+
Enabled: false
|
30
|
+
|
31
|
+
Style/TrailingCommaInArguments:
|
32
|
+
Enabled: false
|
33
|
+
|
34
|
+
Style/SignalException:
|
35
|
+
EnforcedStyle: only_raise
|
36
|
+
|
37
|
+
Style/NumericLiterals:
|
38
|
+
Enabled: true
|
39
|
+
|
40
|
+
Style/CaseIndentation:
|
41
|
+
IndentWhenRelativeTo: end
|
42
|
+
|
43
|
+
Style/IndentHash:
|
44
|
+
EnforcedStyle: consistent
|
45
|
+
|
46
|
+
Style/WordArray:
|
47
|
+
Enabled: true
|
48
|
+
|
49
|
+
Style/ModuleFunction:
|
50
|
+
Enabled: false
|
51
|
+
|
52
|
+
Style/RaiseArgs:
|
53
|
+
EnforcedStyle: compact
|
54
|
+
|
55
|
+
Metrics/AbcSize:
|
56
|
+
Enabled: false
|
57
|
+
|
58
|
+
Metrics/CyclomaticComplexity:
|
59
|
+
Enabled: false
|
60
|
+
|
61
|
+
Style/StringLiterals:
|
62
|
+
Enabled: false
|
63
|
+
|
64
|
+
Metrics/LineLength:
|
65
|
+
Max: 120
|
66
|
+
|
67
|
+
Metrics/ClassLength:
|
68
|
+
Enabled: false
|
69
|
+
|
70
|
+
Metrics/MethodLength:
|
71
|
+
Enabled: false
|
72
|
+
|
73
|
+
Metrics/ParameterLists:
|
74
|
+
Max: 5
|
75
|
+
CountKeywordArgs: false
|
76
|
+
|
77
|
+
Metrics/PerceivedComplexity:
|
78
|
+
Enabled: false
|
79
|
+
|
80
|
+
Lint/EndAlignment:
|
81
|
+
AlignWith: variable
|
82
|
+
|
83
|
+
Style/FrozenStringLiteralComment:
|
84
|
+
Enabled: false
|
85
|
+
|
86
|
+
Style/Alias:
|
87
|
+
EnforcedStyle: prefer_alias_method
|
88
|
+
|
89
|
+
Style/MutableConstant:
|
90
|
+
Enabled: true
|
91
|
+
|
92
|
+
Performance/Casecmp:
|
93
|
+
Enabled: true
|
94
|
+
|
95
|
+
Style/GuardClause:
|
96
|
+
Enabled: true
|
data/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
# Contributor Code of Conduct
|
2
|
+
|
3
|
+
As contributors and maintainers of this project, and in the interest of
|
4
|
+
fostering an open and welcoming community, we pledge to respect all people who
|
5
|
+
contribute through reporting issues, posting feature requests, updating
|
6
|
+
documentation, submitting pull requests or patches, and other activities.
|
7
|
+
|
8
|
+
We are committed to making participation in this project a harassment-free
|
9
|
+
experience for everyone, regardless of level of experience, gender, gender
|
10
|
+
identity and expression, sexual orientation, disability, personal appearance,
|
11
|
+
body size, race, ethnicity, age, religion, or nationality.
|
12
|
+
|
13
|
+
Examples of unacceptable behavior by participants include:
|
14
|
+
|
15
|
+
* The use of sexualized language or imagery
|
16
|
+
* Personal attacks
|
17
|
+
* Trolling or insulting/derogatory comments
|
18
|
+
* Public or private harassment
|
19
|
+
* Publishing others' private information, such as physical or electronic
|
20
|
+
addresses, without explicit permission
|
21
|
+
* Other unethical or unprofessional conduct
|
22
|
+
|
23
|
+
Project maintainers have the right and responsibility to remove, edit, or
|
24
|
+
reject comments, commits, code, wiki edits, issues, and other contributions
|
25
|
+
that are not aligned to this Code of Conduct, or to ban temporarily or
|
26
|
+
permanently any contributor for other behaviors that they deem inappropriate,
|
27
|
+
threatening, offensive, or harmful.
|
28
|
+
|
29
|
+
By adopting this Code of Conduct, project maintainers commit themselves to
|
30
|
+
fairly and consistently applying these principles to every aspect of managing
|
31
|
+
this project. Project maintainers who do not follow or enforce the Code of
|
32
|
+
Conduct may be permanently removed from the project team.
|
33
|
+
|
34
|
+
This code of conduct applies both within project spaces and in public spaces
|
35
|
+
when an individual is representing the project or its community.
|
36
|
+
|
37
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
38
|
+
reported by contacting a project maintainer at richardwu1997@gmail.com. All
|
39
|
+
complaints will be reviewed and investigated and will result in a response that
|
40
|
+
is deemed necessary and appropriate to the circumstances. Maintainers are
|
41
|
+
obligated to maintain confidentiality with regard to the reporter of an
|
42
|
+
incident.
|
43
|
+
|
44
|
+
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
|
45
|
+
version 1.3.0, available at
|
46
|
+
[http://contributor-covenant.org/version/1/3/0/][version]
|
47
|
+
|
48
|
+
[homepage]: http://contributor-covenant.org
|
49
|
+
[version]: http://contributor-covenant.org/version/1/3/0/
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2016 Richard Wu
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,105 @@
|
|
1
|
+
# NewsScraper
|
2
|
+
|
3
|
+
### Simple ETL news scraper in Ruby
|
4
|
+
|
5
|
+
[RubyGems](https://rubygems.org/gems/news_scraper)
|
6
|
+
|
7
|
+
A collection of extractors, transformers and loaders for a variety of news feeds and outlets.
|
8
|
+
|
9
|
+
## Installation
|
10
|
+
|
11
|
+
Add this line to your application's Gemfile:
|
12
|
+
|
13
|
+
```ruby
|
14
|
+
gem 'news_scraper'
|
15
|
+
```
|
16
|
+
|
17
|
+
And then execute:
|
18
|
+
|
19
|
+
$ bundle
|
20
|
+
|
21
|
+
Or install it yourself as:
|
22
|
+
|
23
|
+
$ gem install news_scraper
|
24
|
+
|
25
|
+
## Usage
|
26
|
+
|
27
|
+
### Scraping
|
28
|
+
|
29
|
+
`NewsScraper::Scraper#scrape` will return an array of the transformed data for all Google News RSS articles for the given query.
|
30
|
+
|
31
|
+
Optionally, you can pass in a block and it will yield the transformed data on a per-article basis.
|
32
|
+
|
33
|
+
It takes in 1 parameter `query:`.
|
34
|
+
|
35
|
+
Array notation
|
36
|
+
```
|
37
|
+
article_hashes = NewsScraper::Scraper.new(query: 'Shopify').scrape # [ { author: ... }, { author: ... } ... ]
|
38
|
+
```
|
39
|
+
|
40
|
+
Block notation
|
41
|
+
```
|
42
|
+
NewsScraper::Scraper.new(query: 'Shopify').scrape do |article_hash|
|
43
|
+
# { author: ... }
|
44
|
+
end
|
45
|
+
```
|
46
|
+
|
47
|
+
How the `Scraper` extracts and parses for the information is determined by scrape patterns (see **Scrape Patterns**).
|
48
|
+
|
49
|
+
### Transformed Data
|
50
|
+
|
51
|
+
Calling `NewsScraper::Scraper#scrape` with either the array or block notation will yield `transformed_data` hashes. [`article_scrape_patterns.yml`](https://github.com/richardwu/news_scraper/blob/master/config/article_scrape_patterns.yml) defines the data types that will be scraped for.
|
52
|
+
|
53
|
+
In addition, the `uri` and `root_domain` (hostname) of the article are included in the hash.
|
54
|
+
|
55
|
+
Example
|
56
|
+
```
|
57
|
+
{
|
58
|
+
author: 'Linus Torvald',
|
59
|
+
body: 'The Linux kernel developed by Linus Torvald has become the backbone of most electronic devices we use to-date. It powers mobile phones, laptops, embedded devices, and even rockets...',
|
60
|
+
description: 'The Linux kernel is one of the most important contributions to the world of technology.',
|
61
|
+
keywords: 'linux,kernel,linus,torvald',
|
62
|
+
section: 'technology',
|
63
|
+
datetime: '1991-10-05T12:00:00+00:00',
|
64
|
+
title: 'Linus Linux',
|
65
|
+
uri: 'linusworld.com/the-linux-kernel',
|
66
|
+
root_domain: 'linusworld.com'
|
67
|
+
}
|
68
|
+
```
|
69
|
+
|
70
|
+
### Scrape Patterns
|
71
|
+
|
72
|
+
Scrape patterns are xpath or CSS patterns used by Nokogiri to extract relevant HTML elements.
|
73
|
+
|
74
|
+
Extracting each `:data_type` (see Example under **Transformed Data**) requires a scrape pattern. A few `:presets` are specified in [`article_scrape_patterns.yml`](https://github.com/richardwu/news_scraper/blob/master/config/article_scrape_patterns.yml).
|
75
|
+
|
76
|
+
Since each news site (identified with `:root_domain`) uses a different markup, scrape patterns are defined on a per-`:root_domain` basis.
|
77
|
+
|
78
|
+
Specifying scrape patterns for new, undefined `:root_domains` is called training (see **Training**).
|
79
|
+
|
80
|
+
### Training
|
81
|
+
|
82
|
+
For each `:root_domain`, it is necessary to specify a scrape pattern for each of the `:data_type`s. A rake task was written to provide a CLI for appending new `:root_domain`s using `:preset` scrape patterns.
|
83
|
+
|
84
|
+
Simply run
|
85
|
+
```
|
86
|
+
bundle exec rake scraper:train QUERY=<query>
|
87
|
+
```
|
88
|
+
|
89
|
+
where the CLI will step through the articles and `:root_domain`s of the articles relevant to `<query>`.
|
90
|
+
|
91
|
+
## Development
|
92
|
+
|
93
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
94
|
+
|
95
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
96
|
+
|
97
|
+
## Contributing
|
98
|
+
|
99
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/richardwu/news_scraper. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
100
|
+
|
101
|
+
|
102
|
+
## License
|
103
|
+
|
104
|
+
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
105
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'bundler/gem_tasks'
|
2
|
+
require 'rake/testtask'
|
3
|
+
require 'rdoc/task'
|
4
|
+
|
5
|
+
require 'news_scraper'
|
6
|
+
|
7
|
+
Rake::TestTask.new do |t|
|
8
|
+
t.libs << 'test'
|
9
|
+
t.pattern = 'test/**/*_test.rb'
|
10
|
+
end
|
11
|
+
|
12
|
+
namespace :scraper do
|
13
|
+
desc 'CLI that steps through articles for a given query and displays preset scrape pattern results; parameters: QUERY'
|
14
|
+
task :train do
|
15
|
+
raise "QUERY param not given.\n\tUsage: bundle exec rake scraper:train QUERY=<query>" unless ENV['QUERY']
|
16
|
+
NewsScraper::Trainer.train(query: ENV['QUERY'])
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
RDoc::Task.new do |rdoc|
|
21
|
+
rdoc.main = "README.md"
|
22
|
+
rdoc.rdoc_files.include("README.md", "lib/*.rb", "lib/**/*.rb")
|
23
|
+
rdoc.rdoc_dir = "doc"
|
24
|
+
end
|
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "news_scraper"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
IRB.start
|
data/bin/setup
ADDED
data/circle.yml
ADDED
@@ -0,0 +1,116 @@
|
|
1
|
+
# All domains should include the scrape method/pattern for data_types
|
2
|
+
|
3
|
+
data_types:
|
4
|
+
- "author"
|
5
|
+
- "body"
|
6
|
+
- "description"
|
7
|
+
- "keywords"
|
8
|
+
- "section"
|
9
|
+
- "datetime"
|
10
|
+
- "title"
|
11
|
+
|
12
|
+
# All data types must include :method and :pattern
|
13
|
+
#
|
14
|
+
presets:
|
15
|
+
author:
|
16
|
+
class: &class_author
|
17
|
+
method: css
|
18
|
+
pattern: ".author"
|
19
|
+
id: &id_author
|
20
|
+
method: css
|
21
|
+
pattern: "#author"
|
22
|
+
name: &name_author
|
23
|
+
method: css
|
24
|
+
pattern: ".author-name"
|
25
|
+
link: &link_author
|
26
|
+
method: xpath
|
27
|
+
pattern: "//a[contains(@href, 'author')]"
|
28
|
+
meta: &meta_author
|
29
|
+
method: xpath
|
30
|
+
pattern: "//meta[@name='author']/@content"
|
31
|
+
rel_link: &rel_link_author
|
32
|
+
method: xpath
|
33
|
+
pattern: "//a[@rel='author']"
|
34
|
+
vcard: &vcard_author
|
35
|
+
method: css
|
36
|
+
pattern: ".vcard .fn"
|
37
|
+
body:
|
38
|
+
readability: &readability_body
|
39
|
+
method: "readability"
|
40
|
+
pattern: ""
|
41
|
+
description:
|
42
|
+
meta: &meta_description
|
43
|
+
method: "xpath"
|
44
|
+
pattern: "//meta[@name='description']/@content"
|
45
|
+
og: &og_description
|
46
|
+
method: "xpath"
|
47
|
+
pattern: "//meta[@property='og:description']/@content"
|
48
|
+
keywords:
|
49
|
+
meta: &meta_keywords
|
50
|
+
method: "xpath"
|
51
|
+
pattern: "//meta[@name='keywords']/@content"
|
52
|
+
article_tag: &article_tag_keywords
|
53
|
+
method: "xpath"
|
54
|
+
pattern: "//meta[@property='article:tag']/@content"
|
55
|
+
section:
|
56
|
+
meta: &meta_section
|
57
|
+
method: "xpath"
|
58
|
+
pattern: "//meta[@property='article:section']/@content"
|
59
|
+
datetime:
|
60
|
+
article_date_original: &article_date_original_datetime
|
61
|
+
method: xpath
|
62
|
+
pattern: //meta[@name='article_date_original']/@content
|
63
|
+
article_published_time: &article_published_time_datetime
|
64
|
+
method: "xpath"
|
65
|
+
pattern: "//meta[@property='article:published_time']/@content"
|
66
|
+
date: &date_datetime
|
67
|
+
method: xpath
|
68
|
+
pattern: //meta[@name='date']/@content
|
69
|
+
date_published: &date_published_datetime
|
70
|
+
method: xpath
|
71
|
+
pattern: //*[@itemprop='datePublished']/@datetime
|
72
|
+
og_published_time: &og_published_time_datetime
|
73
|
+
method: xpath
|
74
|
+
pattern: //meta[@property='og:published_time']/@content
|
75
|
+
original_publication_date: &original_publication_date_datetime
|
76
|
+
method: xpath
|
77
|
+
pattern: //meta[@name='OriginalPublicationDate']/@content
|
78
|
+
publication_date: &publication_date_datetime
|
79
|
+
method: xpath
|
80
|
+
pattern: //meta[@name='publication_date']/@content
|
81
|
+
publish_date: &publish_date_datetime
|
82
|
+
method: xpath
|
83
|
+
pattern: //meta[@name='PublishDate']/@content
|
84
|
+
rnews_date_published: &rnews_date_published_datetime
|
85
|
+
method: xpath
|
86
|
+
pattern: //meta[@property='rnews:datePublished']/@content
|
87
|
+
sailthru_date: &sailthru_date_datetime
|
88
|
+
method: xpath
|
89
|
+
pattern: //meta[@name='sailthru.date']/@content
|
90
|
+
title:
|
91
|
+
html: &html_title
|
92
|
+
method: "xpath"
|
93
|
+
pattern: "//title"
|
94
|
+
og: &og_title
|
95
|
+
method: "xpath"
|
96
|
+
pattern: "//meta[@property='og:title']/@content"
|
97
|
+
|
98
|
+
domains:
|
99
|
+
investors.com:
|
100
|
+
author: *rel_link_author
|
101
|
+
body:
|
102
|
+
method: "css"
|
103
|
+
pattern: ".single-post-content"
|
104
|
+
description: *og_description
|
105
|
+
keywords: *meta_keywords
|
106
|
+
section: *meta_section
|
107
|
+
datetime: *article_published_time_datetime
|
108
|
+
title: *og_title
|
109
|
+
fool.com:
|
110
|
+
author: *meta_author
|
111
|
+
body: *readability_body
|
112
|
+
description: *meta_description
|
113
|
+
keywords: *article_tag_keywords
|
114
|
+
section: *meta_section
|
115
|
+
datetime: *date_datetime
|
116
|
+
title: *og_title
|
data/dev.yml
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
require 'readline'
|
2
|
+
|
3
|
+
module NewsScraper
  # Terminal helpers: gutter-prefixed log lines, a y/n confirmation, free-form
  # and numbered-option prompts (via Readline), and full-width header/footer
  # rules. All methods are callable directly on the module (`extend self`).
  module CLI
    extend self

    # ANSI escape sequence used as the default gutter/rule colour.
    DEFAULT_COLOR = "\x1b[36m".freeze

    # Prints +message+ on $stdout behind a coloured "┃" gutter.
    #
    # message  - String to print.
    # color    - ANSI escape sequence for the gutter.
    # new_line - When true, "\n" is appended to the message before printing.
    #
    # NOTE(review): IO#puts does not emit a second newline when the string
    # already ends in one, so +new_line+ currently does not change the output.
    # Behaviour is kept as-is; confirm the intent before changing it.
    def log(message, color: DEFAULT_COLOR, new_line: false)
      message += "\n" if new_line
      $stdout.puts "#{color}┃\x1b[0m #{message}"
    end

    # Logs every "\n"-separated line of +message+ through #log.
    def log_lines(message, color: DEFAULT_COLOR, new_line: false)
      message.split("\n").each do |line|
        log(line, color: color, new_line: new_line)
      end
    end

    # Asks a yes/no question on $stdout and reads the answer from $stdin.
    #
    # Returns true only when the answer starts with "y"/"Y" (ignoring leading
    # whitespace); anything else, including end-of-input, is false.
    def confirm(msg, color: DEFAULT_COLOR)
      print "#{color}┃\x1b[0m #{msg} (y/n) "
      answer = $stdin.gets
      # gets returns nil at EOF; treat a closed stdin as "no" rather than crash.
      return false if answer.nil?
      # Anchor the match so answers such as "nay" are not taken as a yes.
      !(answer =~ /\A\s*[Yy]/).nil?
    end

    # Reads one line of free-form input with Readline.
    #
    # msg - optional String logged before prompting.
    #
    # Returns the entered String, or nil when the read is interrupted
    # (Ctrl-C) or Readline reports end-of-input.
    def get_input(msg = nil)
      log(msg) if msg
      Readline.completion_append_character = " "
      Readline.completion_proc = nil
      result = begin
        Readline.readline("\x1b[34m┃ > \x1b[33m", true)
      rescue Interrupt
        nil
      end
      print "\e[0m" # reset colour
      result
    end

    # Lists +options+ numbered from 1 and keeps prompting until the user types
    # one of those numbers exactly.
    #
    # Returns the chosen element of +options+.
    def prompt_with_options(question, options)
      log(question)
      log("Your options are:")
      options.each.with_index(1) do |v, idx|
        log("#{idx}) #{v}")
      end
      log("Choose a number between 1 and #{options.length}")

      Readline.completion_append_character = " "
      Readline.completion_proc = nil

      buf = -1
      available = (1..options.length).to_a
      until available.include?(buf.to_i)
        # Capture the result of the begin/rescue so that an Interrupt yields
        # nil here instead of leaving the previous (possibly Integer) value,
        # which would make the #chomp below raise NoMethodError.
        buf = begin
          Readline.readline("\x1b[34m┃ > \x1b[33m", true)
        rescue Interrupt
          nil
        end

        if buf.nil?
          STDERR.puts
          next
        end

        buf = buf.chomp
        # Reject empty input and anything that is not a plain integer literal
        # (e.g. "2x", "02"); -1 is never in +available+, so the loop repeats.
        buf = -1 if buf.empty? || buf.to_i.to_s != buf
      end

      print "\e[0m" # reset colour
      options[buf.to_i - 1]
    end

    ## Fancy Headers and Footers

    # Prints a header rule ("┏━━ text━━━…") spanning the terminal width.
    def put_header(text = "", color = DEFAULT_COLOR)
      put_edge(color, "┏━━ ", text)
    end

    # Prints a footer rule ("┗━━━…") spanning the terminal width.
    def put_footer(color = DEFAULT_COLOR)
      put_edge(color, "┗", "")
    end

    # Prints +prefix+ and +text+ in +color+, padded with "━" to the terminal
    # width. The width comes from IO.console when available, otherwise 80, and
    # is never taken to be less than 30.
    def put_edge(color, prefix, text)
      ptext = "#{color}#{prefix}#{text}"
      textwidth = printing_width(ptext)

      termwidth = IO.respond_to?(:console) && IO.console ? IO.console.winsize[1] : 80
      termwidth = 30 if termwidth < 30

      # Trim by the number of *visible* characters that overflow. Slicing the
      # raw string to +termwidth+ would also count the zero-width colour code
      # and leave the line shorter than the terminal.
      overflow = textwidth - termwidth
      if overflow > 0
        ptext = ptext[0...(ptext.size - overflow)]
        textwidth = termwidth
      end
      padwidth = termwidth - textwidth
      pad = "━" * padwidth
      formatted = "#{ptext}#{color}#{pad}\x1b[0m\n"

      $stdout.puts formatted
    end

    # ANSI escape sequences (like \x1b[31m) have zero width.
    # when calculating the padding width, we must exclude them.
    #
    # Returns the number of characters in +str+ once such sequences are removed.
    def printing_width(str)
      # [A-Za-z], not [A-z]: the latter also matches "[", "\", "]", "^", "_" and "`".
      str.gsub(/\x1b\[[\d;]+[A-Za-z]/, '').size
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
|
2
|
+
module NewsScraper
  # Error class with no behaviour beyond StandardError.
  # NOTE(review): presumably raised for failed HTTP responses — confirm against callers.
  class ResponseError < StandardError; end

  module Transformers
    # Error that records which root domain and URI it concerns.
    # NOTE(review): by its name, raised when no scrape pattern is configured
    # for a domain — confirm against callers.
    class ScrapePatternNotDefined < StandardError
      attr_reader :root_domain, :uri

      # opts - Hash with optional :root_domain and :uri entries; both are
      #        exposed through the readers of the same name.
      def initialize(opts = {})
        @root_domain = opts[:root_domain]
        @uri = opts[:uri]
        # Pass an explicit message: a bare `super` would forward the opts Hash
        # itself, making #message a Hash dump instead of readable text.
        super("Scrape pattern not defined for root domain #{@root_domain.inspect} (uri: #{@uri.inspect})")
      end
    end
  end
end
|