onebox 1.4.9 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/onebox/engine/wikipedia_onebox.rb +45 -5
- data/lib/onebox/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5b5e72349ed5dcca5148875d8e669b3850de0311
|
4
|
+
data.tar.gz: b1d032a771038719f1c51064fee5942c50f94353
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cd575f2a3a215a63a5c9359e788cb5c3740b8b7b1d53ad59cbdf9db2e120fc63c95e6c96432f11b38f5dbd8d9a7fff41ce722fe51c70e3843bbad2b5362be229
|
7
|
+
data.tar.gz: 6cb0cc9aef0081ba65b35221261492b44cf6ca161161d50044c1c108d7732a4f9c840867907d8ca1d9ea308430378cfe5f45909a82a31461fd0939ac85399d96
|
@@ -10,26 +10,66 @@ module Onebox
|
|
10
10
|
private
|
11
11
|
|
12
12
|
def data
|
13
|
-
|
14
|
-
paras = raw.search("p")
|
13
|
+
paras = []
|
15
14
|
text = ""
|
16
15
|
|
16
|
+
# Detect a section hash (fragment) in the URL and retrieve the related paragraphs. If no hash is provided, the first few paragraphs will be used.
|
17
|
+
# Author Lidlanca
|
18
|
+
# Date 9/8/2014
|
19
|
+
if ( m_url_hash = @url.match /#([^\/?]+)/ ) #extract url hash
|
20
|
+
m_url_hash_name= m_url_hash[1]
|
21
|
+
end
|
22
|
+
|
23
|
+
unless m_url_hash.nil?
|
24
|
+
section_header_title = raw.xpath("//span[@id='#{m_url_hash_name}']")
|
25
|
+
|
26
|
+
if section_header_title.empty?
|
27
|
+
paras = raw.search("p") #default get all the paras
|
28
|
+
else
|
29
|
+
section_title_text = section_header_title.inner_text
|
30
|
+
section_header = section_header_title[0].parent #parent element of the section span element should be an <h3> node
|
31
|
+
cur_element = section_header
|
32
|
+
|
33
|
+
# p|text|div covers the general case. We assume the presence of at least one P node. If the section has no P node, we may end up with a P node from the next section.
|
34
|
+
# A div tag is commonly used as an asset wrapper in an article section, often as the first element holding an image.
|
35
|
+
# ul support will improve the output generated for a section with a list as the main content (for example: an author's bibliography, a musician's discography, etc.)
|
36
|
+
first_p_found = nil
|
37
|
+
while ( ((next_sibling = cur_element.next_sibling).name =~ /p|text|div|ul/) || first_p_found.nil? ) do #from section header get the next sibling until it is a breaker tag
|
38
|
+
cur_element = next_sibling
|
39
|
+
if (cur_element.name == "p" || cur_element.name == "ul") #we treat a list as we detect a p to avoid showing
|
40
|
+
first_p_found = true
|
41
|
+
paras.push(cur_element)
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
else # no hash found in url
|
46
|
+
paras = raw.search("p") #default get all the paras
|
47
|
+
end
|
48
|
+
|
17
49
|
unless paras.empty?
|
18
50
|
cnt = 0
|
19
51
|
while text.length < Onebox::LayoutSupport.max_text && cnt <= 3
|
20
52
|
break if cnt >= paras.size
|
21
53
|
text << " " unless cnt == 0
|
22
|
-
|
54
|
+
|
55
|
+
if paras[cnt].name =="ul" #Handle UL tag. Generate a textual ordered list (1.item | 2.item | 3.item). Unfortunately, no newline is allowed in the output
|
56
|
+
li_index=1
|
57
|
+
list_items = []
|
58
|
+
paras[cnt].children.css("li").each {|li| list_items.push "#{li_index}." + li.inner_text ; li_index+=1}
|
59
|
+
paragraph = (list_items.join " |\n ")[0..Onebox::LayoutSupport.max_text]
|
60
|
+
else
|
61
|
+
paragraph = paras[cnt].inner_text[0..Onebox::LayoutSupport.max_text]
|
62
|
+
end
|
63
|
+
|
23
64
|
paragraph.gsub!(/\[\d+\]/mi, "")
|
24
65
|
text << paragraph
|
25
66
|
cnt += 1
|
26
67
|
end
|
27
68
|
end
|
28
|
-
|
29
69
|
text = "#{text[0..Onebox::LayoutSupport.max_text]}..." if text.length > Onebox::LayoutSupport.max_text
|
30
70
|
result = {
|
31
71
|
link: link,
|
32
|
-
title: raw.css("html body h1").inner_text,
|
72
|
+
title: raw.css("html body h1").inner_text + (section_title_text ? " | " + section_title_text : ""), #if a section sub title exists add it to the main article title
|
33
73
|
description: text
|
34
74
|
}
|
35
75
|
img = raw.css(".image img")
|
data/lib/onebox/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: onebox
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Joanna Zeta
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2014-
|
13
|
+
date: 2014-09-09 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: multi_json
|
@@ -342,7 +342,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
342
342
|
version: '0'
|
343
343
|
requirements: []
|
344
344
|
rubyforge_project:
|
345
|
-
rubygems_version: 2.
|
345
|
+
rubygems_version: 2.2.2
|
346
346
|
signing_key:
|
347
347
|
specification_version: 4
|
348
348
|
summary: A gem for turning URLs into previews.
|