webxtractor 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/webxtractor.rb +33 -0
  3. metadata +84 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 2648fcab56f4879a51dbaeb6300a572a71b43ac2
4
+ data.tar.gz: ec3d4bb9409bf610379e632912f91640dc0305ce
5
+ SHA512:
6
+ metadata.gz: d6f48d46163786d466d87aeec573d932494557dd34ee01d7419a115fdc8ab9176a5a8f60e112c2ce624e3b513414374051fa984ed816a7cadbe4b964fec4fdbb
7
+ data.tar.gz: fdba6ee9ee7a7d7b64b16ea0e9e803a83b0883640e5bd964c4b183dd21098f5c9576a9d55d61809dbbeb11d3dd0049ce387b77b110bec40840ea39f844ee8c87
@@ -0,0 +1,33 @@
1
+ require 'ostruct'
2
+ require 'nokogiri'
3
+
4
# Extracts basic meta information (the <title> and <h1> texts) from an HTML
# document, either fetched from a URL or passed in as a string.
class Webxtractor
  # Fetches the document at +url+ and extracts its meta information.
  #
  # @param url [String, nil] URL of the page to fetch
  # @return [OpenStruct, nil] the result of {.parse}, or nil when no url is given
  def self.get(url = nil)
    return unless url

    # URI#read is mixed into URI objects by open-uri, which is not loaded by
    # the file-level requires. Load it here (deliberately lazy) so that this
    # method works without the caller having to require it first.
    require 'open-uri'

    uri = URI.parse(url)
    parse(uri.read)
  end

  # Parses an HTML string and collects the title and h1 texts.
  #
  # @param body [String] raw HTML
  # @return [OpenStruct] responds to +title+ and +h1+; each value has the
  #   shape described on {.get_tag}
  def self.parse(body)
    page = Nokogiri::HTML(body)
    result = OpenStruct.new
    result.title = get_tag('title', page)
    result.h1 = get_tag('h1', page)
    result
  end

  # Returns the normalized text of the element(s) matching +selector+.
  #
  # @param selector [String] CSS selector handed to +page.css+
  # @param page [#css] parsed document
  # @return [String, Array<String>] an Array with one normalized text per
  #   element when more than one element matches; otherwise a single String
  #   built from the matched set's combined text
  def self.get_tag(selector, page)
    element = page.css(selector)
    if element.size > 1
      element.map { |x| normalize(x.text) }
    else
      normalize(element.text)
    end
  end

  # Collapses every run of whitespace (including CR/LF line breaks) into a
  # single space and strips leading and trailing whitespace.
  #
  # @param content [String, nil] text to clean up
  # @return [String, nil] the cleaned text, or nil when +content+ is nil
  def self.normalize(content = '')
    return if content.nil?

    # \s already matches \r and \n, so no separate newline pass is needed.
    content.gsub(/\s+/, ' ').strip
  end
end
metadata ADDED
@@ -0,0 +1,84 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: webxtractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.3
5
+ platform: ruby
6
+ authors:
7
+ - schmierkov
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-01-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.6.7.2
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '1.6'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.6.7.2
33
+ - !ruby/object:Gem::Dependency
34
+ name: pry-byebug
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '3.3'
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: 3.3.0
43
+ type: :development
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - "~>"
48
+ - !ruby/object:Gem::Version
49
+ version: '3.3'
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 3.3.0
53
+ description: A simple content extractor
54
+ email: github@schmierkov.de
55
+ executables: []
56
+ extensions: []
57
+ extra_rdoc_files: []
58
+ files:
59
+ - lib/webxtractor.rb
60
+ homepage: https://github.com/schmierkov/webxtractor
61
+ licenses:
62
+ - MIT
63
+ metadata: {}
64
+ post_install_message:
65
+ rdoc_options: []
66
+ require_paths:
67
+ - lib
68
+ required_ruby_version: !ruby/object:Gem::Requirement
69
+ requirements:
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ version: '0'
73
+ required_rubygems_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ requirements: []
79
+ rubyforge_project:
80
+ rubygems_version: 2.4.8
81
+ signing_key:
82
+ specification_version: 4
83
+ summary: Extracts meta information from an HTML page
84
+ test_files: []