cx_extractor 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 311c0a47a060dc52cf13424390ee5a1ad413ed0909307e744c03b5842170b1cb
4
+ data.tar.gz: fcf666ae5b38807a38dbfadf7abffd1ed13089d3f894d7474ee9b33242f164a7
5
+ SHA512:
6
+ metadata.gz: 606cf753ae242db3407f84af1f20feeddc9207913d1a725aa255ade80646a096a9f41d216d2e25a9dc8bf39a7c3fd8c1b6841222100a97ed06e4dc5ae415d656
7
+ data.tar.gz: 5e374a30a8a6fac753507653a99120cf464a7c30bf556e6ee0bac45c030c737a67d7ac67697037944674faf019db2931e8f7aef4b341cdd29356a24f901fd1b9
@@ -0,0 +1,13 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+ distribution.png
10
+ # rspec failure tracking
11
+ .rspec_status
12
+ .DS_Store
13
+ *.py
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
@@ -0,0 +1,10 @@
1
+
2
+ Metrics/MethodLength:
3
+ Max: 20
4
+
5
+ Metrics/AbcSize:
6
+ Max: 20
7
+
8
+ Metrics/LineLength:
9
+ Max: 100
10
+
@@ -0,0 +1,9 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.3.3
5
+ before_install:
6
+ - sudo apt-get install imagemagick
7
+ - gem install bundler -v 1.16.2
8
+ script:
9
+ - bundle exec rspec
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source 'https://rubygems.org'
2
+
3
+ git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in cx_extractor.gemspec
6
+ gemspec
@@ -0,0 +1,51 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ cx_extractor (0.1.2)
5
+ gruff (~> 0.7.0)
6
+ nokogiri (~> 1.0)
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ charlock_holmes (0.7.6)
12
+ diff-lcs (1.3)
13
+ ethon (0.12.0)
14
+ ffi (>= 1.3.0)
15
+ ffi (1.11.1)
16
+ gruff (0.7.0)
17
+ rmagick (~> 2.13, >= 2.13.4)
18
+ mini_portile2 (2.4.0)
19
+ nokogiri (1.10.3)
20
+ mini_portile2 (~> 2.4.0)
21
+ rake (10.5.0)
22
+ rmagick (2.16.0)
23
+ rspec (3.8.0)
24
+ rspec-core (~> 3.8.0)
25
+ rspec-expectations (~> 3.8.0)
26
+ rspec-mocks (~> 3.8.0)
27
+ rspec-core (3.8.0)
28
+ rspec-support (~> 3.8.0)
29
+ rspec-expectations (3.8.3)
30
+ diff-lcs (>= 1.2.0, < 2.0)
31
+ rspec-support (~> 3.8.0)
32
+ rspec-mocks (3.8.0)
33
+ diff-lcs (>= 1.2.0, < 2.0)
34
+ rspec-support (~> 3.8.0)
35
+ rspec-support (3.8.0)
36
+ typhoeus (1.3.1)
37
+ ethon (>= 0.9.0)
38
+
39
+ PLATFORMS
40
+ ruby
41
+
42
+ DEPENDENCIES
43
+ bundler (~> 1.16)
44
+ charlock_holmes
45
+ cx_extractor!
46
+ rake (~> 10.0)
47
+ rspec (~> 3.0)
48
+ typhoeus
49
+
50
+ BUNDLED WITH
51
+ 1.16.6
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2018 Feng Ce
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,39 @@
1
+ # CxExtractor
2
+
3
+ [![Build Status](https://travis-ci.org/fcce/cx_extractor.svg?branch=master)](https://travis-ci.org/fcce/cx_extractor)
4
+
5
+ Extracts the main article text from a web page. It works best on pages that contain a lot of text.
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'cx_extractor'
13
+ ```
14
+
15
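+ Then install ImageMagick (needed by the `gruff` charting dependency, which is built on `rmagick`; the commands below are for macOS with Homebrew) and run Bundler: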
16
+
17
+ brew install imagemagick
18
+ brew link --force imagemagick@6
19
+ bundle
20
+
21
+ Or install it yourself as:
22
+
23
+ gem install cx_extractor
24
+
25
+ ## Usage
26
+
27
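+ ```ruby
+ require 'cx_extractor'
+
+ html = File.read('page.html')
+ CxExtractor.article(html)   # => the main text of the page
+ CxExtractor.get_title(html) # => the contents of the <title> tag
+
+ # Optional settings; the defaults live in lib/cx_extractor/config.rb
+ CxExtractor.configure do |config|
+   config.threshold = 100
+   config.chart_distribution = true
+ end
+ ```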
28
+
29
+ ## Contributing
30
+
31
+ Bug reports and pull requests are welcome on GitHub at https://github.com/fcce/cx_extractor. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
32
+
33
+ ## License
34
+
35
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
36
+
37
+ ## Code of Conduct
38
+
39
+ Everyone interacting in the CxExtractor project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/fcce/cx_extractor/blob/master/CODE_OF_CONDUCT.md).
@@ -0,0 +1,6 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task default: :spec
@@ -0,0 +1 @@
1
+ theme: jekyll-theme-cayman
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bundler/setup'
4
+ require 'cx_extractor'
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require 'irb'
14
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,34 @@
1
# Gem specification for cx_extractor.
# Put lib/ on the load path so the version constant can be read below.
lib_dir = File.expand_path('lib', __dir__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'cx_extractor/version'

Gem::Specification.new do |spec|
  spec.name        = 'cx_extractor'
  spec.version     = CxExtractor::VERSION
  spec.authors     = ['Feng Ce']
  spec.email       = ['kalelfc@gmail.com']

  spec.summary     = 'Used to extract text from the web page.'
  spec.description = "Used to extract text from the web page.\n" \
                     "This tool is appropriate for the web page which contains lots of text.\n"
  spec.homepage    = 'https://fcce.github.io/cx_extractor/'
  spec.license     = 'MIT'

  # Package every git-tracked file except the test directories.
  tracked_files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0")
  end
  spec.files = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }

  spec.bindir        = 'exe'
  spec.executables   = spec.files.grep(%r{^exe/}).map { |path| File.basename(path) }
  spec.require_paths = ['lib']

  # Runtime dependencies.
  spec.add_dependency 'gruff', '~> 0.7.0'
  spec.add_dependency 'nokogiri', '~> 1.0'

  # Development-only dependencies.
  spec.add_development_dependency 'bundler', '~> 1.16'
  spec.add_development_dependency 'charlock_holmes'
  spec.add_development_dependency 'rake', '~> 10.0'
  spec.add_development_dependency 'rspec', '~> 3.0'
  spec.add_development_dependency 'typhoeus'
end
@@ -0,0 +1,67 @@
1
require 'cx_extractor/version'
require 'cx_extractor/config'
require 'cx_extractor/chart'
require 'cx_extractor/utils'
require 'nokogiri'

# typhoeus and charlock_holmes are only development dependencies in the gemspec
# and nothing in this file calls into them. They are still loaded when present,
# but a missing gem must not stop the library itself from loading.
%w[typhoeus charlock_holmes].each do |optional_gem|
  begin
    require optional_gem
  rescue LoadError
    nil
  end
end

# Extracts the main article text from an HTML page by measuring how much text
# each run of consecutive lines holds once the markup has been stripped.
module CxExtractor
  # Chart is always mixed in: whether a chart is drawn is decided on every call
  # from the chart_distribution option, not once when this file is loaded.
  extend Chart
  extend Utils

  class << self
    TITLE_REGEXP = %r{<title>(.*?)</title>}.freeze

    # Characters whose repeated runs are collapsed in extracted text. A bare
    # String#squeeze would collapse every doubled character ("hello" => "helo").
    SQUEEZED_WHITESPACE = " \t\n".freeze

    # Returns the main text of the page.
    #
    # @param html [String] raw HTML source
    # @return [String] extracted text with whitespace runs collapsed
    def article(html)
      lines = get_clean_text(html).split("\n").map(&:strip)
      content = get_content(lines, line_block_distribute(lines))
      content = get_content_by_tag(html, content) if explore_parent
      content.squeeze(SQUEEZED_WHITESPACE).strip
    end

    # Returns the text between <title> and </title>, or nil when absent.
    #
    # @param html [String] raw HTML source
    # @return [String, nil]
    def get_title(html)
      matched = TITLE_REGEXP.match(html)
      matched && matched[1]
    end

    # Collects every content block found in block_distribution and joins the
    # corresponding lines with newlines. Draws the distribution chart when the
    # chart_distribution option is enabled.
    #
    # @param lines [Array<String>] cleaned, stripped lines of the page
    # @param block_distribution [Array<Integer>] result of line_block_distribute
    # @return [String]
    def get_content(lines, block_distribution)
      to_line = 0
      content = []
      chart_points = []
      loop do
        from_line, to_line = get_contect_block(block_distribution, to_line)
        # Stop before touching lines: a negative from_line means "no more
        # blocks", and lines[-1..to_line] is nil for an empty page.
        break if from_line < 0

        content.concat(lines[from_line..to_line])
        chart_points.push(from_line, to_line)
      end
      draw_distribution(block_distribution, chart_points) if chart_distribution
      content.join("\n")
    end

    # Finds the next content block after to_line.
    #
    # @return [Array(Integer, Integer)] first and last line index of the block;
    #   the first index is -1 when no further block exists
    def get_contect_block(block_distribution, to_line)
      from_line = find_surge(block_distribution, to_line, threshold)
      [from_line, find_dive(block_distribution, from_line)]
    end

    # Re-extracts the text from the element that is the parent of the most <p>
    # elements whose text appears in block_content. Falls back to block_content
    # when no paragraph matches, so an already extracted text is never lost.
    #
    # @param html [String] raw HTML source
    # @param block_content [String] text found by get_content
    # @return [String]
    def get_content_by_tag(html, block_content)
      parents = Nokogiri::HTML(html).css('p').each_with_object([]) do |paragraph, found|
        text = paragraph.text
        # "" is a substring of every string, so blank paragraphs would
        # otherwise vote for their parent regardless of the content.
        next if text.strip.empty?

        found << paragraph.parent if block_content.include?(text)
      end
      return block_content if parents.empty?

      container = parents.max_by { |node| parents.count(node) }
      cleaned_lines = get_clean_text(container.to_s).split("\n").map(&:strip)
      cleaned_lines.join("\n").squeeze(SQUEEZED_WHITESPACE)
    end

    private

    # Draws the chart, or warns when there is nothing to draw. Only called when
    # charting is enabled, so the warning is not printed on ordinary runs.
    def draw_distribution(block_distribution, chart_points)
      if chart_points.empty?
        warn 'there is no content for the web page, cannot chart'
      else
        chart(block_distribution, chart_points)
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
+ require 'gruff'
2
module CxExtractor
  # Draws the block distribution as a line chart with Gruff. The object that
  # mixes this module in must respond to chart_theme and chart_file_name.
  module Chart
    # Builds the x-axis labels: the distribution length divided by 2**i for
    # i in 0, 1, 2, 4, 8, plus the three-quarter mark.
    #
    # @param distribution [Array] block distribution
    # @return [Hash{Integer => String}] x position => label text
    def cal_labels(distribution)
      length = distribution.length
      positions = [0, 1, 2, 4, 8].map { |exponent| length / (2**exponent) }
      positions << length * 3 / 4
      positions.each_with_object({}) { |position, labels| labels[position] = position.to_s }
    end

    # Colour for the index-th segment: odd and even segments alternate.
    #
    # @param index [Integer] segment index
    # @return [String] hex colour
    def cal_color(index)
      index.odd? ? '#85AF99' : '#E5E5E5'
    end

    # Writes the chart of distribution to filename.
    #
    # @param distribution [Array<Integer>] block distribution
    # @param chart_points [Array<Integer>] segment boundaries; not modified
    # @param filename [String] output file, chart_file_name by default
    def chart(distribution, chart_points, filename = chart_file_name)
      g = Gruff::Line.new
      g.theme = chart_theme
      g.labels = cal_labels(distribution)
      # Build a new array instead of unshifting into the caller's one, so the
      # argument is left untouched and a frozen array is accepted.
      gruff_line(g, [0] + chart_points, distribution)
      g.hide_legend = true
      g.minimum_value = 0
      g.write(filename)
    end

    # Adds one data series per segment. Each segment runs from its break point
    # to the next one; the last segment runs to the end of the distribution.
    #
    # @param gruff [#dataxy] chart receiving the series
    # @param chart_points [Array<Integer>] segment start indexes
    # @param distribution [Array<Integer>] block distribution
    def gruff_line(gruff, chart_points, distribution)
      chart_points.each_with_index do |start_point, index|
        end_point = chart_points[index + 1] || distribution.length - 1
        gruff.dataxy("line#{start_point}",
                     (start_point..end_point).to_a,
                     distribution[start_point..end_point],
                     cal_color(index))
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
# Configuration for CxExtractor: the default option values plus a reader and a
# writer on the module for every option.
module CxExtractor
  # Default value of every option. Nested values are frozen as well so the
  # defaults cannot be changed through an option that was handed out.
  DEFAULTS = {
    threshold: 86,
    balck_width: 3,
    explore_parent: true,
    chart_distribution: false,
    chart_file_name: 'distribution.png'.freeze,
    chart_theme: {
      marker_color: '#AEA9A9',
      font_color: 'black',
      background_colors: 'white'
    }.freeze
  }.freeze

  class << self
    attr_writer :options

    # Current option values. Starts as a private copy of DEFAULTS; assigning
    # nil through options= makes the next call start from the defaults again.
    #
    # @return [Hash{Symbol => Object}]
    def options
      @options ||= default_options
    end

    # Yields the module so options can be set in a block:
    #   CxExtractor.configure { |config| config.threshold = 100 }
    def configure
      yield self
    end

    private

    # Copies DEFAULTS one level deep. A plain dup would share the nested
    # chart_theme hash, so an in-place change would leak into DEFAULTS and
    # survive a reset.
    def default_options
      DEFAULTS.each_with_object({}) do |(key, value), copy|
        copy[key] =
          case value
          when Hash, String then value.dup
          else value
          end
      end
    end
  end

  # Define CxExtractor.<option> and CxExtractor.<option>= for every default.
  DEFAULTS.each_key do |key|
    define_singleton_method("#{key}=") { |value| options[key] = value }
    define_singleton_method(key) { options[key] }
  end
end
@@ -0,0 +1,63 @@
1
module CxExtractor
  # Helpers for stripping markup and for locating text-dense line blocks.
  # The object that mixes this module in must respond to balck_width.
  module Utils
    # Entities decoded by replace_special_char, applied in a single pass.
    ENTITY_REPLACEMENTS = {
      '&#8226;' => '·',
      '&amp;' => '&',
      '&nbsp;' => ' ',
      '&copy;' => '@'
    }.freeze

    # For every window of balck_width consecutive lines, sums the lengths of
    # the lines in that window.
    #
    # @param lines [Array<String>]
    # @return [Array<Integer>] one sum per window start; empty when there are
    #   fewer lines than the window width
    def line_block_distribute(lines)
      lengths = lines.map(&:length)
      window = balck_width
      window_count = [lines.length - window + 1, 0].max
      Array.new(window_count) { |start| lengths[start, window].inject(0, :+) }
    end

    # Finds the first index after start whose block is larger than threshold
    # and is followed by at least one non-empty block among the next three.
    #
    # @return [Integer] the index, or -1 when there is none
    def find_surge(block_distribution, start, threshold)
      candidates = (start + 1)...(block_distribution.length - 3)
      surge = candidates.find do |index|
        block_distribution[index] > threshold &&
          block_distribution[(index + 1)..(index + 3)].any? { |size| size > 0 }
      end
      surge || -1
    end

    # Finds where the block that starts at surge_point ends: the index just
    # before the first two consecutive empty blocks.
    #
    # @return [Integer] last index of the block, or the last index of
    #   block_distribution when no such pair follows
    def find_dive(block_distribution, surge_point)
      candidates = (surge_point + 1)...(block_distribution.size - 2)
      dive = candidates.find do |index|
        block_distribution[index].zero? && block_distribution[index + 1].zero?
      end
      dive ? dive - 1 : block_distribution.size - 1
    end

    # Returns a copy of dom with comments, scripts, styles, some links and all
    # remaining tags removed, and a few entities decoded.
    #
    # @param dom [String] HTML source; left unmodified, may be frozen
    # @return [String]
    def get_clean_text(dom)
      # dup rather than clone: clone keeps the frozen state, which would make
      # every gsub! below raise for a frozen argument.
      html = dom.dup
      # remove html comment
      html.gsub!(/<!--.*?-->/m, "\n")
      # remove javascript (the opening tag must close on its own line)
      html.gsub!(%r{<script.*?>(?m:.*?)</script>}, "\n")
      # remove a
      # NOTE(review): the character class holds tab, "|", newline, CR and form
      # feed but no space, so <a href="..."> is not removed -- confirm intent.
      html.gsub!(%r{<a[\t|\n|\r|\f].*?>.*?</a>}, '')
      # remove css
      html.gsub!(%r{<style.*?>(?m:.*?)</style>}, "\n")
      # remove tag
      html.gsub!(/<.*?>/m, '')
      replace_special_char(html)
    end

    # Decodes the entities in ENTITY_REPLACEMENTS and turns CRLF and lone CR
    # line endings into LF. Modifies str in place and returns it.
    #
    # @param str [String] text to normalise; must not be frozen
    # @return [String] str itself
    def replace_special_char(str)
      # One pass, so the output of a replacement is never decoded a second
      # time ("&amp;nbsp;" becomes "&nbsp;", not a space).
      str.gsub!(/&#8226;|&amp;|&nbsp;|&copy;/, ENTITY_REPLACEMENTS)
      # A regexp is required here: a String pattern is matched literally.
      str.gsub!(/\r\n|\r/, "\n")
      str
    end
  end
end
@@ -0,0 +1,3 @@
1
module CxExtractor
  # Version of the gem; the gemspec reads it as CxExtractor::VERSION.
  VERSION = '0.1.2'.freeze
end
metadata ADDED
@@ -0,0 +1,162 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: cx_extractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.2
5
+ platform: ruby
6
+ authors:
7
+ - Feng Ce
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2019-06-10 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: gruff
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 0.7.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 0.7.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.16'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.16'
55
+ - !ruby/object:Gem::Dependency
56
+ name: charlock_holmes
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rake
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '10.0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '10.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rspec
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '3.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '3.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: typhoeus
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ description: |
112
+ Used to extract text from the web page.
113
+ This tool is appropriate for the web page which contains lots of text.
114
+ email:
115
+ - kalelfc@gmail.com
116
+ executables: []
117
+ extensions: []
118
+ extra_rdoc_files: []
119
+ files:
120
+ - ".gitignore"
121
+ - ".rspec"
122
+ - ".rubocop.yml"
123
+ - ".travis.yml"
124
+ - Gemfile
125
+ - Gemfile.lock
126
+ - LICENSE.txt
127
+ - README.md
128
+ - Rakefile
129
+ - _config.yml
130
+ - bin/console
131
+ - bin/setup
132
+ - cx_extractor.gemspec
133
+ - lib/cx_extractor.rb
134
+ - lib/cx_extractor/chart.rb
135
+ - lib/cx_extractor/config.rb
136
+ - lib/cx_extractor/utils.rb
137
+ - lib/cx_extractor/version.rb
138
+ homepage: https://fcce.github.io/cx_extractor/
139
+ licenses:
140
+ - MIT
141
+ metadata: {}
142
+ post_install_message:
143
+ rdoc_options: []
144
+ require_paths:
145
+ - lib
146
+ required_ruby_version: !ruby/object:Gem::Requirement
147
+ requirements:
148
+ - - ">="
149
+ - !ruby/object:Gem::Version
150
+ version: '0'
151
+ required_rubygems_version: !ruby/object:Gem::Requirement
152
+ requirements:
153
+ - - ">="
154
+ - !ruby/object:Gem::Version
155
+ version: '0'
156
+ requirements: []
157
+ rubyforge_project:
158
+ rubygems_version: 2.7.9
159
+ signing_key:
160
+ specification_version: 4
161
+ summary: Used to extract text from the web page.
162
+ test_files: []