oald_parser 0.1.8 → 0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/oald_parser/facade.rb +2 -2
- data/lib/oald_parser/formatter.rb +32 -9
- data/lib/oald_parser/page_downloader.rb +2 -3
- data/lib/oald_parser/page_parser.rb +55 -11
- data/lib/oald_parser.rb +16 -1
- metadata +2 -3
data/lib/oald_parser/facade.rb
CHANGED
@@ -14,9 +14,9 @@ module OaldParser
|
|
14
14
|
end
|
15
15
|
|
16
16
|
# Builds a Facade from freshly constructed collaborators: a PageDownloader
# given the dictionary URL, a PageParser, a Formatter created with
# `items: 5`, and a WordExtractor (passed to Facade.new in that order).
def self.create_facade
  Facade.new(
    PageDownloader.new('http://www.oxfordadvancedlearnersdictionary.com/dictionary'),
    PageParser.new,
    Formatter.new(items: 5),
    WordExtractor.new
  )
end
|
@@ -6,18 +6,41 @@ module OaldParser
|
|
6
6
|
@options = options
|
7
7
|
end
|
8
8
|
|
9
|
-
def format(
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
nil
|
9
|
+
# Renders a parsed page as plain text.
#
# When the page has blocks they are rendered via format_blocks; otherwise
# the page's flat item list is rendered via format_items. The `:items`
# entry of the options stored by the constructor is handed to the helper
# as its `limit` argument; 1000 (the helpers' own default) is used when
# the options are not a Hash or carry no `:items` entry.
#
# @param page [#blocks, #items] parsed page
# @return [String] formatted text
def format(page)
  # The :items option used to be stored in @options but never read.
  limit = (@options.is_a?(Hash) && @options[:items]) || 1000

  if page.blocks.empty?
    format_items(page.items, limit)
  else
    format_blocks(page.blocks, limit)
  end
end
|
17
16
|
|
18
17
|
private
|
19
|
-
def
|
20
|
-
|
18
|
+
# Renders every block as three parts joined by newlines: the block text in
# upper case, a rule of 20 dashes, and the block's formatted items. Blocks
# are separated from each other by a blank line.
#
# @param blocks [Enumerable<#text, #items>] blocks to render
# @param limit [Integer] passed on to format_items for each block's items
# @return [String] formatted text ("" when there are no blocks)
def format_blocks(blocks, limit = 1000)
  blocks.map do |block|
    # `limit` used to be accepted here and then silently dropped.
    [block.text.upcase, '-' * 20, format_items(block.items, limit)].join("\n")
  end.join("\n\n")
end
|
29
|
+
|
30
|
+
# Renders items as text. Each item contributes its text, followed (on the
# next line) by its formatted examples when it has any; items are separated
# by a blank line. At most `limit` items, taken from the front, are rendered.
#
# @param items [Enumerable<#text, #examples>] items to render
# @param limit [Integer] maximum number of items to include
# @return [String] formatted text ("" when there are no items)
def format_items(items, limit = 1000)
  # `limit` used to be accepted but never applied.
  items.first(limit).map do |item|
    parts = [item.text]
    parts << format_examples(item.examples) unless item.examples.empty?
    parts.join("\n")
  end.join("\n\n")
end
|
41
|
+
|
42
|
+
# Renders examples one per line, each prefixed with "+ ".
#
# @param examples [Enumerable] example values (interpolated into the line)
# @return [String] newline-joined lines ("" when there are no examples)
def format_examples(examples)
  bulleted = examples.map { |example| "+ #{example}" }
  bulleted.join("\n")
end
|
22
45
|
end
|
23
46
|
end
|
@@ -1,22 +1,66 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
1
3
|
module OaldParser
|
4
|
+
# Read-only holder for the result of parsing one page: the blocks and the
# items it was constructed with.
class Page
  attr_reader :blocks, :items

  # @param blocks [Object] stored as-is and exposed through #blocks
  # @param items [Object] stored as-is and exposed through #items
  def initialize(blocks, items)
    @blocks = blocks
    @items = items
  end
end
|
10
|
+
|
11
|
+
# Read-only holder for one block: its text and the items that belong to it.
class Block
  attr_reader :text, :items

  # @param text [Object] stored as-is and exposed through #text
  # @param items [Object] stored as-is and exposed through #items
  def initialize(text, items)
    @text = text
    @items = items
  end
end
|
17
|
+
|
18
|
+
# Read-only holder for one item: its text and its examples.
class Item
  attr_reader :text, :examples

  # @param text [Object] stored as-is and exposed through #text
  # @param examples [Object] stored as-is and exposed through #examples
  def initialize(text, examples)
    @text = text
    @examples = examples
  end
end
|
24
|
+
|
2
25
|
# Converts dictionary page HTML into Page, Block and Item objects.
#
# Markup classes this parser looks for:
#   div.sd-g  - a block
#   span.n-g  - an item
#   span.x-g  - an example belonging to an item
class PageParser
  # Parses an HTML string with Nokogiri.
  #
  # @param page [String] HTML handed to Nokogiri::HTML
  # @return [Page] a page with blocks and no items when the document
  #   contains `div.sd-g` elements, otherwise a page with no blocks and the
  #   items found in the whole document
  def parse(page)
    parsed = Nokogiri::HTML(page)
    if blocks_on_page?(parsed)
      Page.new(parse_block(parsed), [])
    else
      Page.new([], parse_items(parsed))
    end
  end

  private

  # @param page [#css] parsed document
  # @return [Boolean] true when at least one `div.sd-g` element exists
  def blocks_on_page?(page)
    # Previously returned the first node or nil; a predicate should give
    # back a real boolean. Truthiness is unchanged for existing callers.
    !page.css('div.sd-g').empty?
  end

  # Builds one Block per `div.sd-g` element. The block text is the text of
  # the element's direct children other than `span.n-g` item spans.
  #
  # @param page [#css] parsed document
  # @return [Array<Block>]
  def parse_block(page)
    page.css('div.sd-g').map do |block|
      Block.new(all_except(block, 'n-g'), parse_items(block))
    end
  end

  # Builds one Item per `span.n-g` element under the given node. The item
  # text is the text of the span's direct children other than `span.x-g`
  # example spans; each example is the stripped text of a `span.x-g`.
  #
  # @param block [#css] node to search
  # @return [Array<Item>]
  def parse_items(block)
    block.css('span.n-g').map do |item|
      examples = item.css('span.x-g').map { |example| example.text.strip }
      Item.new(all_except(item, 'x-g'), examples)
    end
  end

  # Concatenates the text of the node's direct children, leaving out spans
  # that carry the given class, and strips the result.
  #
  # @param item [#children] node whose children are inspected
  # @param class_name [String] class of the spans to leave out
  # @return [String]
  def all_except(item, class_name)
    kept = item.children.reject { |child| span_with_class?(child, class_name) }
    kept.map(&:text).join.strip
  end

  # True for a <span> whose class attribute lists class_name among its
  # whitespace-separated tokens.
  def span_with_class?(node, class_name)
    # Token match (not whole-string equality) so that a span such as
    # class="n-g extra", which the CSS selectors above do select, is also
    # excluded here instead of leaking into the surrounding text.
    node.name == 'span' && node[:class].to_s.split.include?(class_name)
  end
end
|
22
66
|
end
|
data/lib/oald_parser.rb
CHANGED
@@ -2,4 +2,19 @@ require_relative 'oald_parser/facade'
|
|
2
2
|
require_relative 'oald_parser/formatter'
|
3
3
|
require_relative 'oald_parser/oald_parser_exception'
|
4
4
|
require_relative 'oald_parser/page_downloader'
|
5
|
-
require_relative 'oald_parser/page_parser'
|
5
|
+
require_relative 'oald_parser/page_parser'
|
6
|
+
|
7
|
+
|
8
|
+
#include OaldParser
|
9
|
+
#
|
10
|
+
#downloader = PageDownloader.new("http://www.oxfordadvancedlearnersdictionary.com/dictionary")
|
11
|
+
#page = downloader.download("surface")
|
12
|
+
#parser = PageParser.new
|
13
|
+
#parsed = parser.parse(page)
|
14
|
+
#formatter = Formatter.new(items: 15)
|
15
|
+
#puts formatter.format(parsed)
|
16
|
+
|
17
|
+
#class=sd-g block
|
18
|
+
#class=n-g new line
|
19
|
+
#class=x-g new list item
|
20
|
+
#class=xr-g delete
|