vbulletin_scraper 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +8 -2
- data/lib/vbulletin_scraper/V4/forum_scraper.rb +43 -0
- data/lib/{V4 → vbulletin_scraper/V4}/post_scraper.rb +4 -50
- data/lib/vbulletin_scraper/V4/quote_scraper.rb +23 -0
- data/lib/{V4/quote_scraper.rb → vbulletin_scraper/V4/scraper.rb} +25 -20
- data/lib/{V4 → vbulletin_scraper/V4}/topic_scraper.rb +3 -61
- data/lib/vbulletin_scraper/version.rb +1 -1
- data/lib/vbulletin_scraper.rb +5 -5
- data/vbulletin_scraper.gemspec +19 -17
- metadata +36 -7
- data/lib/V4/forum_scraper.rb +0 -101
- /data/lib/{configuration.rb → vbulletin_scraper/configuration.rb} +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 75834d520af865ec1aef5d413b34d39015c51ac6
|
4
|
+
data.tar.gz: 458f44485c20b301bb24cae79dcd2bbd309a2fb5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b27b992806780f92751cdf860c2df713e9ac6cf2bde735a197f703e9bc4c873b114feac089058b5d88bd4fa9333faef2face7546f28a4a298b9810de341c2863
|
7
|
+
data.tar.gz: 20e582041beb4f28793e4cabb6e651aa92b533daad3a57b22cf6b0239ac2abc18b6c8b867ea37d4e4f86e53233944720777562251f33bd5ca9e530a89c647784
|
data/Rakefile
CHANGED
@@ -0,0 +1,43 @@
|
|
1
|
+
require_relative 'scraper'
|
2
|
+
|
3
|
+
module VbulletinScraper
|
4
|
+
module V4
|
5
|
+
class ForumScraper < Scraper
|
6
|
+
def is_valid_vbulletin
|
7
|
+
if get_vbulletin_version != ''
|
8
|
+
return true
|
9
|
+
else
|
10
|
+
return false
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
def get_vbulletin_version
|
15
|
+
vbulletinVersion = get_item_by_selector_with_attribute('meta[name="generator"]', 'content')
|
16
|
+
if vbulletinVersion != nil
|
17
|
+
return get_raw_text(vbulletinVersion.gsub('vBulletin', ''))
|
18
|
+
end
|
19
|
+
return ''
|
20
|
+
end
|
21
|
+
|
22
|
+
def get_forum_url
|
23
|
+
pageUrl = get_item_by_selector_with_attribute('base', 'href')
|
24
|
+
if pageUrl != nil
|
25
|
+
return get_raw_text(pageUrl)
|
26
|
+
end
|
27
|
+
return ''
|
28
|
+
end
|
29
|
+
|
30
|
+
def get_forum_title
|
31
|
+
forumTitle = get_item_by_selector_with_attribute('#logo img', 'alt')
|
32
|
+
if forumTitle == ''
|
33
|
+
forumTitle = get_item_by_selector_with_attribute('.logo img', 'title')
|
34
|
+
end
|
35
|
+
if forumTitle != nil
|
36
|
+
return get_raw_text(forumTitle)
|
37
|
+
else
|
38
|
+
return ''
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
@@ -1,10 +1,8 @@
|
|
1
|
-
|
2
|
-
require 'open-uri'
|
3
|
-
require 'open_uri_redirections'
|
1
|
+
require_relative 'scraper'
|
4
2
|
|
5
3
|
module VbulletinScraper
|
6
4
|
module V4
|
7
|
-
class PostScraper
|
5
|
+
class PostScraper < Scraper
|
8
6
|
def initialize(input)
|
9
7
|
@data = nil
|
10
8
|
if input.start_with? "http" || "www"
|
@@ -45,7 +43,8 @@ module VbulletinScraper
|
|
45
43
|
if postContent != nil
|
46
44
|
postContentNoQuotes = Nokogiri::HTML.fragment(postContent.inner_html)
|
47
45
|
postContentNoQuotes.search('div').remove
|
48
|
-
|
46
|
+
postContentNoQuotes.search('comment()').remove
|
47
|
+
return get_raw_text(postContentNoQuotes.to_s)
|
49
48
|
end
|
50
49
|
return ''
|
51
50
|
end
|
@@ -90,51 +89,6 @@ module VbulletinScraper
|
|
90
89
|
end
|
91
90
|
return ''
|
92
91
|
end
|
93
|
-
|
94
|
-
def get_item_by_selector(selector)
|
95
|
-
if @data != nil
|
96
|
-
if @data.at_css(selector)
|
97
|
-
return @data.at_css(selector)
|
98
|
-
end
|
99
|
-
end
|
100
|
-
return nil
|
101
|
-
end
|
102
|
-
|
103
|
-
def get_items_by_selector(selector)
|
104
|
-
if @data != nil
|
105
|
-
if @data.css(selector)
|
106
|
-
return @data.css(selector)
|
107
|
-
end
|
108
|
-
end
|
109
|
-
end
|
110
|
-
|
111
|
-
def get_item_by_selector_with_attribute(selector, attribute)
|
112
|
-
if @data != nil
|
113
|
-
if @data.at_css(selector)
|
114
|
-
return @data.at_css(selector)[attribute]
|
115
|
-
end
|
116
|
-
end
|
117
|
-
return nil
|
118
|
-
end
|
119
|
-
|
120
|
-
def get_raw_text(input)
|
121
|
-
if input != nil
|
122
|
-
return input.strip.gsub(/\u00a0/, ' ').gsub('\t', '')
|
123
|
-
else
|
124
|
-
return nil
|
125
|
-
end
|
126
|
-
end
|
127
|
-
|
128
|
-
def get_int(input)
|
129
|
-
if input != nil
|
130
|
-
if input != ''
|
131
|
-
begin
|
132
|
-
return input.to_i
|
133
|
-
end
|
134
|
-
end
|
135
|
-
end
|
136
|
-
return 0
|
137
|
-
end
|
138
92
|
end
|
139
93
|
end
|
140
94
|
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require_relative 'scraper'
|
2
|
+
|
3
|
+
module VbulletinScraper
|
4
|
+
module V4
|
5
|
+
class QuoteScraper < Scraper
|
6
|
+
def get_quote_author
|
7
|
+
quoteAuthor = get_item_by_selector('.bbcode_postedby strong')
|
8
|
+
if quoteAuthor != nil
|
9
|
+
return get_raw_text(quoteAuthor.text)
|
10
|
+
end
|
11
|
+
return ''
|
12
|
+
end
|
13
|
+
|
14
|
+
def get_quote_content
|
15
|
+
quoteContent = get_item_by_selector('.message')
|
16
|
+
if quoteContent != nil
|
17
|
+
return get_raw_text(quoteContent.text)
|
18
|
+
end
|
19
|
+
return ''
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -4,7 +4,9 @@ require 'open_uri_redirections'
|
|
4
4
|
|
5
5
|
module VbulletinScraper
|
6
6
|
module V4
|
7
|
-
class QuoteScraper
|
7
|
+
class Scraper
|
8
|
+
attr_accessor :data
|
9
|
+
|
8
10
|
def initialize(input)
|
9
11
|
@data = nil
|
10
12
|
if input.start_with? "http" || "www"
|
@@ -16,24 +18,8 @@ module VbulletinScraper
|
|
16
18
|
end
|
17
19
|
end
|
18
20
|
|
19
|
-
def get_quote_author
|
20
|
-
quoteAuthor = get_item_by_selector('.bbcode_postedby strong')
|
21
|
-
if quoteAuthor != nil
|
22
|
-
return get_raw_text(quoteAuthor.text)
|
23
|
-
end
|
24
|
-
return ''
|
25
|
-
end
|
26
|
-
|
27
|
-
def get_quote_content
|
28
|
-
quoteContent = get_item_by_selector('.message')
|
29
|
-
if quoteContent != nil
|
30
|
-
return get_raw_text(quoteContent.text)
|
31
|
-
end
|
32
|
-
return ''
|
33
|
-
end
|
34
|
-
|
35
21
|
def get_item_by_selector(selector)
|
36
|
-
if @data != nil
|
22
|
+
if !@data.nil?
|
37
23
|
if @data.at_css(selector)
|
38
24
|
return @data.at_css(selector)
|
39
25
|
end
|
@@ -41,8 +27,16 @@ module VbulletinScraper
|
|
41
27
|
return nil
|
42
28
|
end
|
43
29
|
|
30
|
+
def get_items_by_selector(selector)
|
31
|
+
if !@data.nil?
|
32
|
+
if @data.css(selector)
|
33
|
+
return @data.css(selector)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
44
38
|
def get_item_by_selector_with_attribute(selector, attribute)
|
45
|
-
if @data != nil
|
39
|
+
if !@data.nil?
|
46
40
|
if @data.at_css(selector)
|
47
41
|
return @data.at_css(selector)[attribute]
|
48
42
|
end
|
@@ -51,12 +45,23 @@ module VbulletinScraper
|
|
51
45
|
end
|
52
46
|
|
53
47
|
def get_raw_text(input)
|
54
|
-
if input != nil
|
48
|
+
if !input.nil?
|
55
49
|
return input.strip.gsub(/\u00a0/, ' ')
|
56
50
|
else
|
57
51
|
return nil
|
58
52
|
end
|
59
53
|
end
|
54
|
+
|
55
|
+
def get_int(input)
|
56
|
+
if !input.nil?
|
57
|
+
if input != ''
|
58
|
+
begin
|
59
|
+
return input.to_i
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
return 0
|
64
|
+
end
|
60
65
|
end
|
61
66
|
end
|
62
67
|
end
|
@@ -1,21 +1,8 @@
|
|
1
|
-
|
2
|
-
require 'open-uri'
|
3
|
-
require 'open_uri_redirections'
|
1
|
+
require_relative 'scraper'
|
4
2
|
|
5
3
|
module VbulletinScraper
|
6
4
|
module V4
|
7
|
-
class TopicScraper
|
8
|
-
def initialize(input)
|
9
|
-
@data = nil
|
10
|
-
if input.start_with? "http" || "www"
|
11
|
-
@data = Nokogiri::HTML(open(input, :allow_redirections => :all))
|
12
|
-
@data.encoding = "UTF-8"
|
13
|
-
else
|
14
|
-
@data = Nokogiri::HTML(input)
|
15
|
-
@data.encoding = "UTF-8"
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
5
|
+
class TopicScraper < Scraper
|
19
6
|
def is_valid_vbulletin
|
20
7
|
if get_vbulletin_version != ''
|
21
8
|
return true
|
@@ -34,7 +21,7 @@ module VbulletinScraper
|
|
34
21
|
|
35
22
|
def get_current_page_number
|
36
23
|
if is_valid_vbulletin
|
37
|
-
pageNumber = get_item_by_selector('#pagination_top a.popupctrl')
|
24
|
+
pageNumber = get_item_by_selector('#pagination_top a.popupctrl')
|
38
25
|
if pageNumber != nil
|
39
26
|
pageNumber = pageNumber.text.gsub('Page', '').gsub(' ', '').split('of').first
|
40
27
|
return get_int(get_raw_text(pageNumber))
|
@@ -95,51 +82,6 @@ module VbulletinScraper
|
|
95
82
|
return []
|
96
83
|
end
|
97
84
|
end
|
98
|
-
|
99
|
-
def get_item_by_selector(selector)
|
100
|
-
if @data != nil
|
101
|
-
if @data.at_css(selector)
|
102
|
-
return @data.at_css(selector)
|
103
|
-
end
|
104
|
-
end
|
105
|
-
return nil
|
106
|
-
end
|
107
|
-
|
108
|
-
def get_items_by_selector(selector)
|
109
|
-
if @data != nil
|
110
|
-
if @data.css(selector)
|
111
|
-
return @data.css(selector)
|
112
|
-
end
|
113
|
-
end
|
114
|
-
end
|
115
|
-
|
116
|
-
def get_item_by_selector_with_attribute(selector, attribute)
|
117
|
-
if @data != nil
|
118
|
-
if @data.at_css(selector)
|
119
|
-
return @data.at_css(selector)[attribute]
|
120
|
-
end
|
121
|
-
end
|
122
|
-
return nil
|
123
|
-
end
|
124
|
-
|
125
|
-
def get_raw_text(input)
|
126
|
-
if input != nil
|
127
|
-
return input.strip.gsub(/\u00a0/, ' ')
|
128
|
-
else
|
129
|
-
return nil
|
130
|
-
end
|
131
|
-
end
|
132
|
-
|
133
|
-
def get_int(input)
|
134
|
-
if input != nil
|
135
|
-
if input != ''
|
136
|
-
begin
|
137
|
-
return input.to_i
|
138
|
-
end
|
139
|
-
end
|
140
|
-
end
|
141
|
-
return 0
|
142
|
-
end
|
143
85
|
end
|
144
86
|
end
|
145
87
|
end
|
data/lib/vbulletin_scraper.rb
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
require_relative 'vbulletin_scraper/version'
|
2
|
-
require_relative 'configuration'
|
3
|
-
require_relative 'V4/forum_scraper'
|
4
|
-
require_relative 'V4/topic_scraper'
|
5
|
-
require_relative 'V4/post_scraper'
|
6
|
-
require_relative 'V4/quote_scraper'
|
2
|
+
require_relative 'vbulletin_scraper/configuration'
|
3
|
+
require_relative 'vbulletin_scraper/V4/forum_scraper'
|
4
|
+
require_relative 'vbulletin_scraper/V4/topic_scraper'
|
5
|
+
require_relative 'vbulletin_scraper/V4/post_scraper'
|
6
|
+
require_relative 'vbulletin_scraper/V4/quote_scraper'
|
7
7
|
|
8
8
|
module VbulletinScraper
|
9
9
|
class << self
|
data/vbulletin_scraper.gemspec
CHANGED
@@ -4,24 +4,26 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
4
|
require 'vbulletin_scraper/version'
|
5
5
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
7
|
+
spec.name = "vbulletin_scraper"
|
8
|
+
spec.version = VbulletinScraper::VERSION
|
9
|
+
spec.authors = ["Ben Walters"]
|
10
|
+
spec.email = ["walters.benj@gmail.com"]
|
11
11
|
|
12
|
-
|
13
|
-
|
14
|
-
|
12
|
+
spec.summary = "This gem is designed to allow you to scrape compatible vBulletin forum threads for various data."
|
13
|
+
spec.homepage = "https://github.com/bendrick92/vbulletin_scraper"
|
14
|
+
spec.license = "MIT"
|
15
15
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
16
|
+
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
17
|
+
spec.bindir = "exe"
|
18
|
+
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
19
|
+
spec.require_paths = ["lib"]
|
20
20
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
21
|
+
spec.add_dependency 'nokogiri', '~> 1.6.8'
|
22
|
+
spec.add_dependency 'open_uri_redirections'
|
23
|
+
|
24
|
+
spec.add_development_dependency "simplecov", "~> 0.12"
|
25
|
+
spec.add_development_dependency "codeclimate-test-reporter", "~> 0.6"
|
26
|
+
spec.add_development_dependency 'bundler', '~> 1.11'
|
27
|
+
spec.add_development_dependency 'rake', '~> 10.0'
|
28
|
+
spec.add_development_dependency 'rspec', '~> 3.2'
|
27
29
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: vbulletin_scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Walters
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-10-
|
11
|
+
date: 2016-10-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -38,6 +38,34 @@ dependencies:
|
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: simplecov
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0.12'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0.12'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: codeclimate-test-reporter
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0.6'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0.6'
|
41
69
|
- !ruby/object:Gem::Dependency
|
42
70
|
name: bundler
|
43
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -96,12 +124,13 @@ files:
|
|
96
124
|
- Rakefile
|
97
125
|
- bin/console
|
98
126
|
- bin/setup
|
99
|
-
- lib/V4/forum_scraper.rb
|
100
|
-
- lib/V4/post_scraper.rb
|
101
|
-
- lib/V4/quote_scraper.rb
|
102
|
-
- lib/V4/topic_scraper.rb
|
103
|
-
- lib/configuration.rb
|
104
127
|
- lib/vbulletin_scraper.rb
|
128
|
+
- lib/vbulletin_scraper/V4/forum_scraper.rb
|
129
|
+
- lib/vbulletin_scraper/V4/post_scraper.rb
|
130
|
+
- lib/vbulletin_scraper/V4/quote_scraper.rb
|
131
|
+
- lib/vbulletin_scraper/V4/scraper.rb
|
132
|
+
- lib/vbulletin_scraper/V4/topic_scraper.rb
|
133
|
+
- lib/vbulletin_scraper/configuration.rb
|
105
134
|
- lib/vbulletin_scraper/version.rb
|
106
135
|
- vbulletin_scraper.gemspec
|
107
136
|
homepage: https://github.com/bendrick92/vbulletin_scraper
|
data/lib/V4/forum_scraper.rb
DELETED
@@ -1,101 +0,0 @@
|
|
1
|
-
require 'nokogiri'
|
2
|
-
require 'open-uri'
|
3
|
-
require 'open_uri_redirections'
|
4
|
-
|
5
|
-
module VbulletinScraper
|
6
|
-
module V4
|
7
|
-
class ForumScraper
|
8
|
-
def initialize(input)
|
9
|
-
@data = nil
|
10
|
-
if input.start_with? "http" || "www"
|
11
|
-
@data = Nokogiri::HTML(open(input, :allow_redirections => :all))
|
12
|
-
@data.encoding = "UTF-8"
|
13
|
-
else
|
14
|
-
@data = Nokogiri::HTML(input)
|
15
|
-
@data.encoding = "UTF-8"
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
|
-
def is_valid_vbulletin
|
20
|
-
if get_vbulletin_version != ''
|
21
|
-
return true
|
22
|
-
else
|
23
|
-
return false
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
def get_vbulletin_version
|
28
|
-
vbulletinVersion = get_item_by_selector_with_attribute('meta[name="generator"]', 'content')
|
29
|
-
if vbulletinVersion != nil
|
30
|
-
return get_raw_text(vbulletinVersion.gsub('vBulletin', ''))
|
31
|
-
end
|
32
|
-
return ''
|
33
|
-
end
|
34
|
-
|
35
|
-
def get_forum_url
|
36
|
-
pageUrl = get_item_by_selector_with_attribute('base', 'href')
|
37
|
-
if pageUrl != nil
|
38
|
-
return get_raw_text(pageUrl)
|
39
|
-
end
|
40
|
-
return ''
|
41
|
-
end
|
42
|
-
|
43
|
-
def get_forum_title
|
44
|
-
forumTitle = get_item_by_selector_with_attribute('#logo img', 'alt')
|
45
|
-
if forumTitle == ''
|
46
|
-
forumTitle = get_item_by_selector_with_attribute('.logo img', 'title')
|
47
|
-
end
|
48
|
-
if forumTitle != nil
|
49
|
-
return get_raw_text(forumTitle)
|
50
|
-
else
|
51
|
-
return ''
|
52
|
-
end
|
53
|
-
end
|
54
|
-
|
55
|
-
def get_item_by_selector(selector)
|
56
|
-
if @data != nil
|
57
|
-
if @data.at_css(selector)
|
58
|
-
return @data.at_css(selector)
|
59
|
-
end
|
60
|
-
end
|
61
|
-
return nil
|
62
|
-
end
|
63
|
-
|
64
|
-
def get_items_by_selector(selector)
|
65
|
-
if @data != nil
|
66
|
-
if @data.css(selector)
|
67
|
-
return @data.css(selector)
|
68
|
-
end
|
69
|
-
end
|
70
|
-
end
|
71
|
-
|
72
|
-
def get_item_by_selector_with_attribute(selector, attribute)
|
73
|
-
if @data != nil
|
74
|
-
if @data.at_css(selector)
|
75
|
-
return @data.at_css(selector)[attribute]
|
76
|
-
end
|
77
|
-
end
|
78
|
-
return nil
|
79
|
-
end
|
80
|
-
|
81
|
-
def get_raw_text(input)
|
82
|
-
if input != nil
|
83
|
-
return input.strip.gsub(/\u00a0/, ' ')
|
84
|
-
else
|
85
|
-
return nil
|
86
|
-
end
|
87
|
-
end
|
88
|
-
|
89
|
-
def get_int(input)
|
90
|
-
if input != nil
|
91
|
-
if input != ''
|
92
|
-
begin
|
93
|
-
return input.to_i
|
94
|
-
end
|
95
|
-
end
|
96
|
-
end
|
97
|
-
return 0
|
98
|
-
end
|
99
|
-
end
|
100
|
-
end
|
101
|
-
end
|
File without changes
|