iudex-html 1.2.b.0-java → 1.2.b.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.rdoc CHANGED
@@ -1,3 +1,9 @@
1
+ === 1.2.b.1 (2012-5-31)
2
+ * Add support for HTML 5 (draft) tags, attributes
3
+ * Neko parser support for HTML 5 <meta charset>
4
+ * Neko parser keeps non-HTML attributes when skipBanned = false
5
+ * Add options, barc read support to iudex-html-clean
6
+
1
7
  === 1.2.b.0 (2012-3-4)
2
8
  * Upgrade to gravitext-xmlprod ~> 1.5.b
3
9
  * Fix duplicate attributes from Neko, last value wins.
data/Manifest.txt CHANGED
@@ -25,4 +25,4 @@ test/test_parse_filter.rb
25
25
  test/test_stax_parser.rb
26
26
  test/test_tree_walker.rb
27
27
  test/test_word_counters.rb
28
- lib/iudex-html/iudex-html-1.2.b.0.jar
28
+ lib/iudex-html/iudex-html-1.2.b.1.jar
data/bin/iudex-html-clean CHANGED
@@ -19,40 +19,128 @@
19
19
  $LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
20
20
 
21
21
  require 'rubygems'
22
- require 'rjack-logback'
23
22
 
24
- require 'iudex-html'
25
- require 'iudex-filter/key_helper'
23
+ module IudexBinScript
24
+ require 'rjack-logback'
25
+ include RJack
26
26
 
27
- require 'gravitext-xmlprod/extensions'
27
+ Logback.config_console( :stderr => true )
28
28
 
29
- require 'java'
29
+ require 'iudex-html'
30
+ require 'iudex-filter/key_helper'
30
31
 
31
- class HTMLCleaner
32
- include Iudex::HTML
33
- include Iudex::HTML::Tree
34
- include Iudex::HTML::Filters
35
- include Iudex::HTML::Tree::Filters
32
+ require 'gravitext-xmlprod/extensions'
36
33
 
37
- import 'iudex.html.HTMLUtils'
38
- import 'iudex.html.tree.TreeWalker'
34
+ require 'java'
39
35
 
40
- def run( input = ARGV.first )
41
- source = HTMLUtils::source( IO.read( input ).to_java_bytes, "UTF-8" )
42
- tree = HTMLUtils::parse( source )
36
+ class HTMLCleaner
37
+ include Gravitext::XMLProd
38
+ include Iudex::Core
39
+ include Iudex::HTML
40
+ include Iudex::HTML::Tree
41
+ include Iudex::HTML::Filters
42
+ include Iudex::HTML::Tree::Filters
43
43
 
44
- tfc = TreeFilterChain.new( [ XmpToPreConverter.new,
45
- CSSDisplayFilter.new,
46
- AttributeCleaner.new,
47
- MojiBakeCleaner.new,
48
- CharactersNormalizer.new,
49
- EmptyInlineRemover.new ] )
44
+ import 'iudex.html.HTMLUtils'
45
+ import 'iudex.html.neko.NekoHTMLParser'
46
+ import 'iudex.html.tree.TreeWalker'
47
+ import 'iudex.http.Headers'
48
+ import 'iudex.util.Charsets'
50
49
 
51
- TreeWalker.walk_depth_first( tfc, tree )
50
+ def initialize
51
+ @default_encoding = "UTF-8"
52
+ @trim_non_displayed = false
53
+ @indentor = Indentor::PRETTY
54
+ end
55
+
56
+ def run( args = ARGV )
57
+ files = parse_args( args )
58
+
59
+ files.each do |f|
60
+ if f =~ /\.barc$/
61
+ process_barc( f )
62
+ else
63
+ process_file( f )
64
+ end
65
+ end
66
+
67
+ end
68
+
69
+ def process_file( fname )
70
+
71
+ input = if fname == '-'
72
+ $stdin.read
73
+ else
74
+ IO.read( fname )
75
+ end
76
+
77
+ source = HTMLUtils::source( input.to_java_bytes, @default_encoding )
78
+ process( source )
79
+ end
80
+
81
+ def process_barc( bname )
82
+ require 'iudex-barc' #FIXME: Undeclared
83
+ barc_file = Iudex::BARC::BARCFile.new( java.io.File.new( bname ) )
84
+ barc_reader = barc_file.reader
85
+ while( rec = barc_reader.next )
86
+ next unless rec.type.chr == 'H'
87
+ source = ContentSource.new( rec.body_input_stream )
88
+ ctype = Headers.content_type( rec.response_headers )
89
+ if ctype && ctype.charset
90
+ enc = Charsets.lookup( ctype.charset )
91
+ source.set_default_encoding( enc ) if enc
92
+ end
93
+ process( source )
94
+ end
95
+ end
96
+
97
+ def process( source )
98
+ parser = NekoHTMLParser.new
99
+ parser.skip_banned = @trim_non_displayed
100
+
101
+ tree = parser.parse( source )
102
+
103
+ filters = [ XmpToPreConverter.new,
104
+ ( [ CSSDisplayFilter.new,
105
+ AttributeCleaner.new ] if @trim_non_displayed ),
106
+ MojiBakeCleaner.new,
107
+ CharactersNormalizer.new,
108
+ EmptyInlineRemover.new ].flatten.compact
109
+
110
+ TreeWalker.walk_depth_first( TreeFilterChain.new( filters ),
111
+ tree )
112
+
113
+ puts tree.to_xml( :indentor => @indentor )
114
+ end
115
+
116
+ def parse_args( args = ARGV )
117
+ parser = OptionParser.new do |opts|
118
+ opts.banner =
119
+ "Usage: iudex-html-clean [options] (FILE|barc)...\n" +
120
+ "Options:\n"
121
+
122
+ opts.on( "-v", "--version", "Display version and exit" ) do
123
+ puts "iudex-html: #{Iudex::HTML::VERSION}"
124
+ exit 1
125
+ end
126
+
127
+ opts.on( "-t", "--trim-non-display",
128
+ "Trim banned/non-displayed elements from output" ) do
129
+ @trim_non_displayed = true
130
+ end
131
+
132
+ opts.on( "-i", "--indentor NAME",
133
+ "Specify indentor to use for output: " +
134
+ "PRETTY (default), COMPACT" ) do |name|
135
+ @indentor = Indentor.const_get( name.upcase.to_sym )
136
+ end
137
+
138
+ end
139
+
140
+ parser.parse( args )
141
+ end
52
142
 
53
- puts tree.to_xml
54
143
  end
55
144
 
145
+ HTMLCleaner.new.run
56
146
  end
57
-
58
- HTMLCleaner.new.run
data/build/attributes CHANGED
@@ -21,6 +21,7 @@
21
21
  # Sources
22
22
  # http://www.w3.org/TR/xhtml11/
23
23
  # http://www.w3.org/TR/html4/
24
+ # http://dev.w3.org/html5/spec/
24
25
  # http://www.w3schools.com/tags/ref_standardattributes.asp
25
26
  # http://xhtml.com
26
27
 
@@ -28,6 +29,7 @@ CORE :: ALL except: base head html meta param script style title
28
29
  class ,*CORE
29
30
  id ,*CORE
30
31
  style ,*CORE
32
+ hidden ,*CORE, hidden element
31
33
  title ,CORE, extra title
32
34
 
33
35
  LANG :: ALL except: base br frame frameset hr iframe param
@@ -40,17 +42,17 @@ content ,meta, text
40
42
  scheme ,meta, format URI
41
43
 
42
44
  # Anchor and link attributes
43
- charset ,a link, char_encoding of link
45
+ charset ,a link meta, char_encoding of link or (meta) document
44
46
  coords ,*a, coordinates; i.e. image map
45
47
  hreflang ,link, language_code of referent
46
48
  href ,a base link, URL
47
- media ,link
49
+ media ,a area link
48
50
  name ,a, section_name anchor
49
51
  rel ,a link
50
52
  rev ,a link
51
53
  shape ,*a
52
54
  target ,*a *base *link
53
- type ,link
55
+ type ,a link
54
56
 
55
57
  # Image and some frame attributes
56
58
  src ,frame img
data/build/tags CHANGED
@@ -17,6 +17,7 @@
17
17
  # Sources
18
18
  # http://www.w3.org/TR/xhtml11/
19
19
  # http://www.w3.org/TR/html4/
20
+ # http://dev.w3.org/html5/spec/
20
21
  # http://www.w3schools.com/tags/default.asp
21
22
  # http://xhtml.com/
22
23
  #
@@ -25,106 +26,120 @@
25
26
  # S :: In Strict HTML 4.01/XHTML 1.0
26
27
  # T :: In Transitional HTML 4.01/XHTML 1.0
27
28
  # F :: In frameset annex
29
+ # 5 :: HTML5 new elements
28
30
  # D :: Deprecated
29
31
  # I :: Inline elements (Note <br/> is not labeled inline.)
30
32
  # M :: Metadata elements (content not visible text), i.e. head
31
33
  # B :: Banned/blacklisted elements from which text should not be extracted.
32
34
 
33
- a , S T F I , anchor
34
- abbr , S T F I , abbreviation
35
- acronym , S T F I , acronym
36
- address , S T F , contact information for the author or owner
37
- applet , T F D , embedded applet
38
- area ,E S T F , area inside an image-map
39
- b , S T F I , bold text
40
- base ,E S T F M , default address or a default target for all links on a page
41
- basefont ,E T F D I M , default font; color; or size for the text in a page
42
- bdo , S T F I , the text direction
43
- big , S T F I , big text
44
- blockquote , S T F , long quotation
45
- body , S T F , the document's body
46
- br ,E S T F , single line break
47
- button , S T F I B, push button
48
- caption , S T F , table caption
49
- center , T F D , centered text
50
- cite , S T F I , citation
51
- code , S T F I , computer code text
52
- col ,E S T F , attribute values for one or more columns in a table
53
- colgroup , S T F , group of columns in a table for formatting
54
- dd , S T F , description of a term in a definition list
55
- del , S T F I , deleted text
56
- dfn , S T F I , definition term
57
- dir , T F D , directory list
58
- div , S T F , section in a document
59
- dl , S T F , definition list
60
- dt , S T F , term (an item) in a definition list
61
- em , S T F I , emphasized text
62
- fieldset , S T F B, border around elements in a form
63
- font , T F D I , font; color; or size for text
64
- form , S T F , form for user input
65
- frame ,E F B, window (a frame) in a frameset
66
- frameset , F B, set of frames
67
- h1 , S T F , heading level 1
68
- h2 , S T F , heading level 2
69
- h3 , S T F , heading level 3
70
- h4 , S T F , heading level 4
71
- h5 , S T F , heading level 5
72
- h6 , S T F , heading level 6
73
- head , S T F M , information about the document
74
- hr ,E S T F , horizontal line
75
- html , S T F , document
76
- i , S T F I , italic text
77
- iframe , T F , nline frame
78
- img ,E S T F I , image
79
- input ,E S T F I B, input control
80
- ins , S T F I , inserted text
81
- isindex , T F D , searchable index related to a document
82
- kbd , S T F I , keyboard text
83
- label , S T F I B, label for an input element
84
- legend , S T F B, caption for a fieldset element
85
- li , S T F , list item
86
- link ,E S T F M , the relationship between a document and an external resource
87
- map , S T F I , image-map
88
- menu , T F D , menu list
89
- meta ,E S T F M , metadata
90
- noframes , T F B, alternate content where frames not supported
91
- noscript , S T F B, alternate content script not supported
92
- object , S T F I B, embedded object
93
- ol , S T F , ordered list
94
- optgroup , S T F B, group of related options in a select list
95
- option , S T F B, option in a select list
96
- p , S T F , paragraph
97
- param ,E S T F , parameter for an object
98
- pre , S T F , preformatted text
99
- q , S T F I , short quotation
100
- rb , , ruby base text
101
- rbc , , ruby base container (complex)
102
- rp , , ruby simple text container
103
- rt , , ruby annotation text
104
- rtc , , ruby text container (complex)
105
- ruby , I , ruby pronunciation aid
106
- s , T F D I , strikethrough text
107
- samp , S T F I , sample computer code
108
- script , S T F I B, client-side script
109
- select , S T F I B, select list (drop-down list)
110
- small , S T F I , small text
111
- span , S T F I , section in a document
112
- strike , T F D I , strikethrough text
113
- strong , S T F I , strong text
114
- style , S T F B, style information for a document
115
- sub , S T F I , subscripted text
116
- sup , S T F I , superscripted text
117
- table , S T F , table
118
- tbody , S T F , Groups the body content in a table
119
- td , S T F , cell in a table
120
- textarea , S T F I B, multi-line text input control
121
- tfoot , S T F , Groups the footer content in a table
122
- th , S T F , header cell in a table
123
- thead , S T F , Groups the header content in a table
124
- title , S T F M , the title of a document
125
- tr , S T F , row in a table
126
- tt , S T F I , teletype text
127
- u , T F D I , underlined text
128
- ul , S T F , unordered list
129
- var , S T F I , variable part of a text
130
- xmp , D , preformatted text
35
+ a , S T F 5 I , anchor
36
+ abbr , S T F 5 I , abbreviation
37
+ acronym , S T F I , acronym
38
+ address , S T F 5 , contact information for the author or owner
39
+ applet , T F D , embedded applet
40
+ area ,E S T F 5 , area inside an image-map
41
+ article , 5 , Structure: An independent content element
42
+ aside , 5 , Structure: Tangentially related content
43
+ b , S T F 5 I , bold text
44
+ base ,E S T F 5 M , default address or target for all links on a page
45
+ basefont ,E T F D I M , default font; color; or size for the text in a page
46
+ bdi , 5 I , Text isolated from surrounding for BIDI formatting
47
+ bdo , S T F 5 I , the text direction
48
+ big , S T F I , big text
49
+ blockquote , S T F 5 , long quotation
50
+ body , S T F 5 , the document's body
51
+ br ,E S T F 5 , single line break
52
+ button , S T F 5 I B, push button
53
+ caption , S T F 5 , table caption
54
+ center , T F D , centered text
55
+ cite , S T F 5 I , citation
56
+ code , S T F 5 I , computer code text
57
+ col ,E S T F 5 , attribute values for one or more columns in a table
58
+ colgroup , S T F 5 , group of columns in a table for formatting
59
+ dd , S T F 5 , description of a term in a definition list
60
+ del , S T F 5 I , deleted text
61
+ dfn , S T F 5 I , definition term
62
+ dir , T F D , directory list
63
+ div , S T F 5 , section in a document
64
+ dl , S T F 5 , definition list
65
+ dt , S T F 5 , term (an item) in a definition list
66
+ em , S T F 5 I , emphasized text
67
+ fieldset , S T F 5 B, border around elements in a form
68
+ figcaption , 5 , Structure: A figure caption
69
+ figure , 5 , Structure: Self contained content that can be moved.
70
+ font , T F D I , font; color; or size for text
71
+ footer , 5 , Structure: A footer of a section
72
+ form , S T F 5 , form for user input
73
+ frame ,E F B, window (a frame) in a frameset
74
+ frameset , F B, set of frames
75
+ h1 , S T F 5 , heading level 1
76
+ h2 , S T F 5 , heading level 2
77
+ h3 , S T F 5 , heading level 3
78
+ h4 , S T F 5 , heading level 4
79
+ h5 , S T F 5 , heading level 5
80
+ h6 , S T F 5 , heading level 6
81
+ head , S T F 5 M , information about the document
82
+ header , 5 , Structure: A header of a section
83
+ hgroup , 5 , Structure: A group of headings
84
+ hr ,E S T F 5 , horizontal line
85
+ html , S T F 5 , document
86
+ i , S T F 5 I , italic text
87
+ iframe , T F 5 , inline frame
88
+ img ,E S T F 5 I , image
89
+ input ,E S T F 5 I B, input control
90
+ ins , S T F 5 I , inserted text
91
+ isindex , T F D , searchable index related to a document
92
+ kbd , S T F 5 I , keyboard text
93
+ label , S T F 5 I B, label for an input element
94
+ legend , S T F 5 B, caption for a fieldset element
95
+ li , S T F 5 , list item
96
+ link ,E S T F 5 M , relationship between a document and an external resource
97
+ map , S T F 5 I , image-map
98
+ mark , 5 I , Text marked/highlighted for reference purposes
99
+ menu , T F 5 D , menu list
100
+ meta ,E S T F 5 M , metadata
101
+ nav , 5 , Structure: container for navigational links
102
+ noframes , T F B, alternate content where frames not supported
103
+ noscript , S T F 5 B, alternate content script not supported
104
+ object , S T F 5 I B, embedded object
105
+ ol , S T F 5 , ordered list
106
+ optgroup , S T F 5 B, group of related options in a select list
107
+ option , S T F 5 B, option in a select list
108
+ p , S T F 5 , paragraph
109
+ param ,E S T F 5 , parameter for an object
110
+ pre , S T F 5 , preformatted text
111
+ q , S T F 5 I , short quotation
112
+ rb , 5 , ruby base text
113
+ rbc , 5 , ruby base container (complex)
114
+ rp , 5 , ruby simple text container
115
+ rt , 5 , ruby annotation text
116
+ rtc , 5 , ruby text container (complex)
117
+ ruby , 5 I , ruby pronunciation aid
118
+ s , T F 5 D I , strikethrough text
119
+ samp , S T F 5 I , sample computer code
120
+ script , S T F 5 I B, client-side script
121
+ section , 5 , Structure: generic document/application section
122
+ select , S T F 5 I B, select list (drop-down list)
123
+ small , S T F 5 I , small text
124
+ span , S T F 5 I , section in a document
125
+ strike , T F D I , strikethrough text
126
+ strong , S T F 5 I , strong text
127
+ style , S T F 5 B, style information for a document
128
+ sub , S T F 5 I , subscripted text
129
+ sup , S T F 5 I , superscripted text
130
+ table , S T F 5 , table
131
+ tbody , S T F 5 , Groups the body content in a table
132
+ td , S T F 5 , cell in a table
133
+ textarea , S T F 5 I B, multi-line text input control
134
+ tfoot , S T F 5 , Groups the footer content in a table
135
+ th , S T F 5 , header cell in a table
136
+ thead , S T F 5 , Groups the header content in a table
137
+ time , 5 I , A date or time
138
+ title , S T F 5 M , the title of a document
139
+ tr , S T F 5 , row in a table
140
+ tt , S T F I , teletype text
141
+ u , T F 5 D I , underlined text
142
+ ul , S T F 5 , unordered list
143
+ var , S T F 5 I , variable part of a text
144
+ wbr ,E 5 I , A line break opportunity
145
+ xmp , D , preformatted text
@@ -16,6 +16,6 @@
16
16
 
17
17
  module Iudex
18
18
  module HTML
19
- VERSION = '1.2.b.0'
19
+ VERSION = '1.2.b.1'
20
20
  end
21
21
  end
data/pom.xml CHANGED
@@ -3,7 +3,7 @@
3
3
  <groupId>iudex</groupId>
4
4
  <artifactId>iudex-html</artifactId>
5
5
  <packaging>jar</packaging>
6
- <version>1.2.b.0</version>
6
+ <version>1.2.b.1</version>
7
7
  <name>Iudex HTML parsing/filtering and text extraction</name>
8
8
 
9
9
  <parent>
@@ -24,7 +24,7 @@
24
24
  <dependency>
25
25
  <groupId>com.gravitext</groupId>
26
26
  <artifactId>gravitext-xmlprod</artifactId>
27
- <version>[1.5,1.5.9999)</version>
27
+ <version>[1.5.1,1.5.9999)</version>
28
28
  </dependency>
29
29
 
30
30
  <dependency>
@@ -48,6 +48,11 @@ HTML
48
48
  assert_doc( alt, parse( alt, "UTF-8" ) )
49
49
  end
50
50
 
51
+ def test_meta_charset_rerun
52
+ alt = HTML_META.sub( /<meta .*\/>/, '<meta charset="utf-8"/>' )
53
+ assert_doc( alt, parse( alt, "ISO-8859-1" ) )
54
+ end
55
+
51
56
  HTML_SKIP_TAGS = <<HTML
52
57
  <html xmlns="http://www.w3.org/1999/xhtml">
53
58
  <head>
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: iudex-html
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease: 4
5
- version: 1.2.b.0
5
+ version: 1.2.b.1
6
6
  platform: java
7
7
  authors:
8
8
  - David Kellum
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2012-03-05 00:00:00 Z
13
+ date: 2012-06-01 00:00:00 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: iudex-core
@@ -41,7 +41,7 @@ dependencies:
41
41
  requirements:
42
42
  - - ~>
43
43
  - !ruby/object:Gem::Version
44
- version: 1.5.b
44
+ version: 1.5.1
45
45
  requirement: *id003
46
46
  prerelease: false
47
47
  type: :runtime
@@ -117,7 +117,7 @@ files:
117
117
  - test/test_stax_parser.rb
118
118
  - test/test_tree_walker.rb
119
119
  - test/test_word_counters.rb
120
- - lib/iudex-html/iudex-html-1.2.b.0.jar
120
+ - lib/iudex-html/iudex-html-1.2.b.1.jar
121
121
  homepage: http://github.com/dekellum/iudex
122
122
  licenses: []
123
123