webxtractor 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/webxtractor.rb +33 -0
  3. metadata +84 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 2648fcab56f4879a51dbaeb6300a572a71b43ac2
4
+ data.tar.gz: ec3d4bb9409bf610379e632912f91640dc0305ce
5
+ SHA512:
6
+ metadata.gz: d6f48d46163786d466d87aeec573d932494557dd34ee01d7419a115fdc8ab9176a5a8f60e112c2ce624e3b513414374051fa984ed816a7cadbe4b964fec4fdbb
7
+ data.tar.gz: fdba6ee9ee7a7d7b64b16ea0e9e803a83b0883640e5bd964c4b183dd21098f5c9576a9d55d61809dbbeb11d3dd0049ce387b77b110bec40840ea39f844ee8c87
@@ -0,0 +1,33 @@
1
+ require 'ostruct'
2
+ require 'nokogiri'
3
+
4
# Extracts a small set of text elements (currently <title> and <h1>) from an
# HTML document. Every method is defined on the class itself; no instance
# state is kept.
class Webxtractor
  # Fetches the document at +url+ and extracts its elements.
  #
  # @param url [String, nil] address of the page to fetch
  # @return [OpenStruct, nil] the result of .parse, or nil when no url is given
  def self.get(url = nil)
    return unless url

    # URI#read is supplied by open-uri (which also loads 'uri'); without it
    # the call below fails with NoMethodError. It is loaded here, on first
    # use, so callers that only use .parse do not pull in network code.
    require 'open-uri'
    parse(URI.parse(url).read)
  end

  # Parses an HTML string and collects the extracted elements.
  #
  # @param body [String] raw HTML
  # @return [OpenStruct] responds to +title+ and +h1+; each holds whatever
  #   .get_tag returned for that selector
  def self.parse(body)
    page = Nokogiri::HTML(body)
    OpenStruct.new(title: get_tag('title', page), h1: get_tag('h1', page))
  end

  # Returns the normalized text of every node matching +selector+.
  #
  # @param selector [String] CSS selector handed to +page.css+
  # @param page [#css] parsed document
  # @return [Array<String>, String] one normalized string per node when more
  #   than one node matches; otherwise the normalized text of the whole set
  def self.get_tag(selector, page)
    nodes = page.css(selector)
    return nodes.map { |node| normalize(node.text) } if nodes.size > 1

    normalize(nodes.text)
  end

  # Collapses every run of whitespace into a single space and trims both ends.
  #
  # @param content [String, nil] text to clean up
  # @return [String, nil] cleaned text, or nil when +content+ is nil
  def self.normalize(content = '')
    return if content.nil?

    # \s already matches \r and \n, so a single pass handles line breaks too.
    content.gsub(/\s+/, ' ').strip
  end
end
metadata ADDED
@@ -0,0 +1,84 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: webxtractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.3
5
+ platform: ruby
6
+ authors:
7
+ - schmierkov
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-01-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.6.7.2
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '1.6'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.6.7.2
33
+ - !ruby/object:Gem::Dependency
34
+ name: pry-byebug
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '3.3'
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: 3.3.0
43
+ type: :development
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - "~>"
48
+ - !ruby/object:Gem::Version
49
+ version: '3.3'
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 3.3.0
53
+ description: A simple content extractor
54
+ email: github@schmierkov.de
55
+ executables: []
56
+ extensions: []
57
+ extra_rdoc_files: []
58
+ files:
59
+ - lib/webxtractor.rb
60
+ homepage: https://github.com/schmierkov/webxtractor
61
+ licenses:
62
+ - MIT
63
+ metadata: {}
64
+ post_install_message:
65
+ rdoc_options: []
66
+ require_paths:
67
+ - lib
68
+ required_ruby_version: !ruby/object:Gem::Requirement
69
+ requirements:
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ version: '0'
73
+ required_rubygems_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ requirements: []
79
+ rubyforge_project:
80
+ rubygems_version: 2.4.8
81
+ signing_key:
82
+ specification_version: 4
83
+ summary: Extracts meta informations from a HTML Page
84
+ test_files: []