iudex-html 1.2.b.0-java → 1.2.b.1-java

Sign up to get free protection for your applications and to get access to all the features.
data/History.rdoc CHANGED
@@ -1,3 +1,9 @@
1
+ === 1.2.b.1 (2012-5-31)
2
+ * Add support for HTML 5 (draft) tags, attributes
3
+ * Neko parser support for HTML 5 <meta charset>
4
+ * Neko parser keeps non-HTML attributes when skipBanned = false
5
+ * Add options, barc read support to iudex-html-clean
6
+
1
7
  === 1.2.b.0 (2012-3-4)
2
8
  * Upgrade to gravitext-xmlprod ~> 1.5.b
3
9
  * Fix duplicate attributes from Neko, last value wins.
data/Manifest.txt CHANGED
@@ -25,4 +25,4 @@ test/test_parse_filter.rb
25
25
  test/test_stax_parser.rb
26
26
  test/test_tree_walker.rb
27
27
  test/test_word_counters.rb
28
- lib/iudex-html/iudex-html-1.2.b.0.jar
28
+ lib/iudex-html/iudex-html-1.2.b.1.jar
data/bin/iudex-html-clean CHANGED
@@ -19,40 +19,128 @@
19
19
  $LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
20
20
 
21
21
  require 'rubygems'
22
- require 'rjack-logback'
23
22
 
24
- require 'iudex-html'
25
- require 'iudex-filter/key_helper'
23
+ module IudexBinScript
24
+ require 'rjack-logback'
25
+ include RJack
26
26
 
27
- require 'gravitext-xmlprod/extensions'
27
+ Logback.config_console( :stderr => true )
28
28
 
29
- require 'java'
29
+ require 'iudex-html'
30
+ require 'iudex-filter/key_helper'
30
31
 
31
- class HTMLCleaner
32
- include Iudex::HTML
33
- include Iudex::HTML::Tree
34
- include Iudex::HTML::Filters
35
- include Iudex::HTML::Tree::Filters
32
+ require 'gravitext-xmlprod/extensions'
36
33
 
37
- import 'iudex.html.HTMLUtils'
38
- import 'iudex.html.tree.TreeWalker'
34
+ require 'java'
39
35
 
40
- def run( input = ARGV.first )
41
- source = HTMLUtils::source( IO.read( input ).to_java_bytes, "UTF-8" )
42
- tree = HTMLUtils::parse( source )
36
+ class HTMLCleaner
37
+ include Gravitext::XMLProd
38
+ include Iudex::Core
39
+ include Iudex::HTML
40
+ include Iudex::HTML::Tree
41
+ include Iudex::HTML::Filters
42
+ include Iudex::HTML::Tree::Filters
43
43
 
44
- tfc = TreeFilterChain.new( [ XmpToPreConverter.new,
45
- CSSDisplayFilter.new,
46
- AttributeCleaner.new,
47
- MojiBakeCleaner.new,
48
- CharactersNormalizer.new,
49
- EmptyInlineRemover.new ] )
44
+ import 'iudex.html.HTMLUtils'
45
+ import 'iudex.html.neko.NekoHTMLParser'
46
+ import 'iudex.html.tree.TreeWalker'
47
+ import 'iudex.http.Headers'
48
+ import 'iudex.util.Charsets'
50
49
 
51
- TreeWalker.walk_depth_first( tfc, tree )
50
+ def initialize
51
+ @default_encoding = "UTF-8"
52
+ @trim_non_displayed = false
53
+ @indentor = Indentor::PRETTY
54
+ end
55
+
56
+ def run( args = ARGV )
57
+ files = parse_args( args )
58
+
59
+ files.each do |f|
60
+ if f =~ /\.barc$/
61
+ process_barc( f )
62
+ else
63
+ process_file( f )
64
+ end
65
+ end
66
+
67
+ end
68
+
69
+ def process_file( fname )
70
+
71
+ input = if fname == '-'
72
+ $stdin.read
73
+ else
74
+ IO.read( fname )
75
+ end
76
+
77
+ source = HTMLUtils::source( input.to_java_bytes, @default_encoding )
78
+ process( source )
79
+ end
80
+
81
+ def process_barc( bname )
82
+ require 'iudex-barc' #FIXME: Undeclared
83
+ barc_file = Iudex::BARC::BARCFile.new( java.io.File.new( bname ) )
84
+ barc_reader = barc_file.reader
85
+ while( rec = barc_reader.next )
86
+ next unless rec.type.chr == 'H'
87
+ source = ContentSource.new( rec.body_input_stream )
88
+ ctype = Headers.content_type( rec.response_headers )
89
+ if ctype && ctype.charset
90
+ enc = Charsets.lookup( ctype.charset )
91
+ source.set_default_encoding( enc ) if enc
92
+ end
93
+ process( source )
94
+ end
95
+ end
96
+
97
+ def process( source )
98
+ parser = NekoHTMLParser.new
99
+ parser.skip_banned = @trim_non_displayed
100
+
101
+ tree = parser.parse( source )
102
+
103
+ filters = [ XmpToPreConverter.new,
104
+ ( [ CSSDisplayFilter.new,
105
+ AttributeCleaner.new ] if @trim_non_displayed ),
106
+ MojiBakeCleaner.new,
107
+ CharactersNormalizer.new,
108
+ EmptyInlineRemover.new ].flatten.compact
109
+
110
+ TreeWalker.walk_depth_first( TreeFilterChain.new( filters ),
111
+ tree )
112
+
113
+ puts tree.to_xml( :indentor => @indentor )
114
+ end
115
+
116
+ def parse_args( args = ARGV )
117
+ parser = OptionParser.new do |opts|
118
+ opts.banner =
119
+ "Usage: iudex-html-clean [options] (FILE|barc)...\n" +
120
+ "Options:\n"
121
+
122
+ opts.on( "-v", "--version", "Display version and exit" ) do
123
+ puts "iudex-html: #{Iudex::HTML::VERSION}"
124
+ exit 1
125
+ end
126
+
127
+ opts.on( "-t", "--trim-non-display",
128
+ "Trim banned/non-displayed elements from output" ) do
129
+ @trim_non_displayed = true
130
+ end
131
+
132
+ opts.on( "-i", "--indentor NAME",
133
+ "Specify indentor to use for output: " +
134
+ "PRETTY (default), COMPACT" ) do |name|
135
+ @indentor = Indentor.const_get( name.upcase.to_sym )
136
+ end
137
+
138
+ end
139
+
140
+ parser.parse( args )
141
+ end
52
142
 
53
- puts tree.to_xml
54
143
  end
55
144
 
145
+ HTMLCleaner.new.run
56
146
  end
57
-
58
- HTMLCleaner.new.run
data/build/attributes CHANGED
@@ -21,6 +21,7 @@
21
21
  # Sources
22
22
  # http://www.w3.org/TR/xhtml11/
23
23
  # http://www.w3.org/TR/html4/
24
+ # http://dev.w3.org/html5/spec/
24
25
  # http://www.w3schools.com/tags/ref_standardattributes.asp
25
26
  # http://xhtml.com
26
27
 
@@ -28,6 +29,7 @@ CORE :: ALL except: base head html meta param script style title
28
29
  class ,*CORE
29
30
  id ,*CORE
30
31
  style ,*CORE
32
+ hidden ,*CORE, hidden element
31
33
  title ,CORE, extra title
32
34
 
33
35
  LANG :: ALL except: base br frame frameset hr iframe param
@@ -40,17 +42,17 @@ content ,meta, text
40
42
  scheme ,meta, format URI
41
43
 
42
44
  # Anchor and link attributes
43
- charset ,a link, char_encoding of link
45
+ charset ,a link meta, char_encoding of link or (meta) document
44
46
  coords ,*a, coordinates; i.e. image map
45
47
  hreflang ,link, language_code of referent
46
48
  href ,a base link, URL
47
- media ,link
49
+ media ,a area link
48
50
  name ,a, section_name anchor
49
51
  rel ,a link
50
52
  rev ,a link
51
53
  shape ,*a
52
54
  target ,*a *base *link
53
- type ,link
55
+ type ,a link
54
56
 
55
57
  # Image and some frame attributes
56
58
  src ,frame img
data/build/tags CHANGED
@@ -17,6 +17,7 @@
17
17
  # Sources
18
18
  # http://www.w3.org/TR/xhtml11/
19
19
  # http://www.w3.org/TR/html4/
20
+ # http://dev.w3.org/html5/spec/
20
21
  # http://www.w3schools.com/tags/default.asp
21
22
  # http://xhtml.com/
22
23
  #
@@ -25,106 +26,120 @@
25
26
  # S :: In Strict HTML 4.01/XHTML 1.0
26
27
  # T :: In Transitional HTML 4.01/XHTML 1.0
27
28
  # F :: In frameset annex
29
+ # 5 :: HTML5 new elements
28
30
  # D :: Deprecated
29
31
  # I :: Inline elements (Note <br/> is not labeled inline.)
30
32
  # M :: Metadata elements (content not visible text), i.e. head
31
33
  # B :: Banned/blacklisted elements from which text should not be extracted.
32
34
 
33
- a , S T F I , anchor
34
- abbr , S T F I , abbreviation
35
- acronym , S T F I , acronym
36
- address , S T F , contact information for the author or owner
37
- applet , T F D , embedded applet
38
- area ,E S T F , area inside an image-map
39
- b , S T F I , bold text
40
- base ,E S T F M , default address or a default target for all links on a page
41
- basefont ,E T F D I M , default font; color; or size for the text in a page
42
- bdo , S T F I , the text direction
43
- big , S T F I , big text
44
- blockquote , S T F , long quotation
45
- body , S T F , the document's body
46
- br ,E S T F , single line break
47
- button , S T F I B, push button
48
- caption , S T F , table caption
49
- center , T F D , centered text
50
- cite , S T F I , citation
51
- code , S T F I , computer code text
52
- col ,E S T F , attribute values for one or more columns in a table
53
- colgroup , S T F , group of columns in a table for formatting
54
- dd , S T F , description of a term in a definition list
55
- del , S T F I , deleted text
56
- dfn , S T F I , definition term
57
- dir , T F D , directory list
58
- div , S T F , section in a document
59
- dl , S T F , definition list
60
- dt , S T F , term (an item) in a definition list
61
- em , S T F I , emphasized text
62
- fieldset , S T F B, border around elements in a form
63
- font , T F D I , font; color; or size for text
64
- form , S T F , form for user input
65
- frame ,E F B, window (a frame) in a frameset
66
- frameset , F B, set of frames
67
- h1 , S T F , heading level 1
68
- h2 , S T F , heading level 2
69
- h3 , S T F , heading level 3
70
- h4 , S T F , heading level 4
71
- h5 , S T F , heading level 5
72
- h6 , S T F , heading level 6
73
- head , S T F M , information about the document
74
- hr ,E S T F , horizontal line
75
- html , S T F , document
76
- i , S T F I , italic text
77
- iframe , T F , inline frame
78
- img ,E S T F I , image
79
- input ,E S T F I B, input control
80
- ins , S T F I , inserted text
81
- isindex , T F D , searchable index related to a document
82
- kbd , S T F I , keyboard text
83
- label , S T F I B, label for an input element
84
- legend , S T F B, caption for a fieldset element
85
- li , S T F , list item
86
- link ,E S T F M , the relationship between a document and an external resource
87
- map , S T F I , image-map
88
- menu , T F D , menu list
89
- meta ,E S T F M , metadata
90
- noframes , T F B, alternate content where frames not supported
91
- noscript , S T F B, alternate content script not supported
92
- object , S T F I B, embedded object
93
- ol , S T F , ordered list
94
- optgroup , S T F B, group of related options in a select list
95
- option , S T F B, option in a select list
96
- p , S T F , paragraph
97
- param ,E S T F , parameter for an object
98
- pre , S T F , preformatted text
99
- q , S T F I , short quotation
100
- rb , , ruby base text
101
- rbc , , ruby base container (complex)
102
- rp , , ruby simple text container
103
- rt , , ruby annotation text
104
- rtc , , ruby text container (complex)
105
- ruby , I , ruby pronunciation aid
106
- s , T F D I , strikethrough text
107
- samp , S T F I , sample computer code
108
- script , S T F I B, client-side script
109
- select , S T F I B, select list (drop-down list)
110
- small , S T F I , small text
111
- span , S T F I , section in a document
112
- strike , T F D I , strikethrough text
113
- strong , S T F I , strong text
114
- style , S T F B, style information for a document
115
- sub , S T F I , subscripted text
116
- sup , S T F I , superscripted text
117
- table , S T F , table
118
- tbody , S T F , Groups the body content in a table
119
- td , S T F , cell in a table
120
- textarea , S T F I B, multi-line text input control
121
- tfoot , S T F , Groups the footer content in a table
122
- th , S T F , header cell in a table
123
- thead , S T F , Groups the header content in a table
124
- title , S T F M , the title of a document
125
- tr , S T F , row in a table
126
- tt , S T F I , teletype text
127
- u , T F D I , underlined text
128
- ul , S T F , unordered list
129
- var , S T F I , variable part of a text
130
- xmp , D , preformatted text
35
+ a , S T F 5 I , anchor
36
+ abbr , S T F 5 I , abbreviation
37
+ acronym , S T F I , acronym
38
+ address , S T F 5 , contact information for the author or owner
39
+ applet , T F D , embedded applet
40
+ area ,E S T F 5 , area inside an image-map
41
+ article , 5 , Structure: An independent content element
42
+ aside , 5 , Structure: Tangentially related content
43
+ b , S T F 5 I , bold text
44
+ base ,E S T F 5 M , default address or target for all links on a page
45
+ basefont ,E T F D I M , default font; color; or size for the text in a page
46
+ bdi , 5 I , Text isolated from surrounding for BIDI formatting
47
+ bdo , S T F 5 I , the text direction
48
+ big , S T F I , big text
49
+ blockquote , S T F 5 , long quotation
50
+ body , S T F 5 , the document's body
51
+ br ,E S T F 5 , single line break
52
+ button , S T F 5 I B, push button
53
+ caption , S T F 5 , table caption
54
+ center , T F D , centered text
55
+ cite , S T F 5 I , citation
56
+ code , S T F 5 I , computer code text
57
+ col ,E S T F 5 , attribute values for one or more columns in a table
58
+ colgroup , S T F 5 , group of columns in a table for formatting
59
+ dd , S T F 5 , description of a term in a definition list
60
+ del , S T F 5 I , deleted text
61
+ dfn , S T F 5 I , definition term
62
+ dir , T F D , directory list
63
+ div , S T F 5 , section in a document
64
+ dl , S T F 5 , definition list
65
+ dt , S T F 5 , term (an item) in a definition list
66
+ em , S T F 5 I , emphasized text
67
+ fieldset , S T F 5 B, border around elements in a form
68
+ figcaption , 5 , Structure: A figure caption
69
+ figure , 5 , Structure: Self contained content that can be moved.
70
+ font , T F D I , font; color; or size for text
71
+ footer , 5 , Structure: A footer of a section
72
+ form , S T F 5 , form for user input
73
+ frame ,E F B, window (a frame) in a frameset
74
+ frameset , F B, set of frames
75
+ h1 , S T F 5 , heading level 1
76
+ h2 , S T F 5 , heading level 2
77
+ h3 , S T F 5 , heading level 3
78
+ h4 , S T F 5 , heading level 4
79
+ h5 , S T F 5 , heading level 5
80
+ h6 , S T F 5 , heading level 6
81
+ head , S T F 5 M , information about the document
82
+ header , 5 , Structure: A header of a section
83
+ hgroup , 5 , Structure: A group of headings
84
+ hr ,E S T F 5 , horizontal line
85
+ html , S T F 5 , document
86
+ i , S T F 5 I , italic text
87
+ iframe , T F 5 , inline frame
88
+ img ,E S T F 5 I , image
89
+ input ,E S T F 5 I B, input control
90
+ ins , S T F 5 I , inserted text
91
+ isindex , T F D , searchable index related to a document
92
+ kbd , S T F 5 I , keyboard text
93
+ label , S T F 5 I B, label for an input element
94
+ legend , S T F 5 B, caption for a fieldset element
95
+ li , S T F 5 , list item
96
+ link ,E S T F 5 M , relationship between a document and an external resource
97
+ map , S T F 5 I , image-map
98
+ mark , 5 I , Text marked/highlighted for reference purposes
99
+ menu , T F 5 D , menu list
100
+ meta ,E S T F 5 M , metadata
101
+ nav , 5 , Structure: container for navigational links
102
+ noframes , T F B, alternate content where frames not supported
103
+ noscript , S T F 5 B, alternate content script not supported
104
+ object , S T F 5 I B, embedded object
105
+ ol , S T F 5 , ordered list
106
+ optgroup , S T F 5 B, group of related options in a select list
107
+ option , S T F 5 B, option in a select list
108
+ p , S T F 5 , paragraph
109
+ param ,E S T F 5 , parameter for an object
110
+ pre , S T F 5 , preformatted text
111
+ q , S T F 5 I , short quotation
112
+ rb , 5 , ruby base text
113
+ rbc , 5 , ruby base container (complex)
114
+ rp , 5 , ruby simple text container
115
+ rt , 5 , ruby annotation text
116
+ rtc , 5 , ruby text container (complex)
117
+ ruby , 5 I , ruby pronunciation aid
118
+ s , T F 5 D I , strikethrough text
119
+ samp , S T F 5 I , sample computer code
120
+ script , S T F 5 I B, client-side script
121
+ section , 5 , Structure: generic document/application section
122
+ select , S T F 5 I B, select list (drop-down list)
123
+ small , S T F 5 I , small text
124
+ span , S T F 5 I , section in a document
125
+ strike , T F D I , strikethrough text
126
+ strong , S T F 5 I , strong text
127
+ style , S T F 5 B, style information for a document
128
+ sub , S T F 5 I , subscripted text
129
+ sup , S T F 5 I , superscripted text
130
+ table , S T F 5 , table
131
+ tbody , S T F 5 , Groups the body content in a table
132
+ td , S T F 5 , cell in a table
133
+ textarea , S T F 5 I B, multi-line text input control
134
+ tfoot , S T F 5 , Groups the footer content in a table
135
+ th , S T F 5 , header cell in a table
136
+ thead , S T F 5 , Groups the header content in a table
137
+ time , 5 I , A date or time
138
+ title , S T F 5 M , the title of a document
139
+ tr , S T F 5 , row in a table
140
+ tt , S T F I , teletype text
141
+ u , T F 5 D I , underlined text
142
+ ul , S T F 5 , unordered list
143
+ var , S T F 5 I , variable part of a text
144
+ wbr ,E 5 I , A line break opportunity
145
+ xmp , D , preformatted text
@@ -16,6 +16,6 @@
16
16
 
17
17
  module Iudex
18
18
  module HTML
19
- VERSION = '1.2.b.0'
19
+ VERSION = '1.2.b.1'
20
20
  end
21
21
  end
data/pom.xml CHANGED
@@ -3,7 +3,7 @@
3
3
  <groupId>iudex</groupId>
4
4
  <artifactId>iudex-html</artifactId>
5
5
  <packaging>jar</packaging>
6
- <version>1.2.b.0</version>
6
+ <version>1.2.b.1</version>
7
7
  <name>Iudex HTML parsing/filtering and text extraction</name>
8
8
 
9
9
  <parent>
@@ -24,7 +24,7 @@
24
24
  <dependency>
25
25
  <groupId>com.gravitext</groupId>
26
26
  <artifactId>gravitext-xmlprod</artifactId>
27
- <version>[1.5,1.5.9999)</version>
27
+ <version>[1.5.1,1.5.9999)</version>
28
28
  </dependency>
29
29
 
30
30
  <dependency>
@@ -48,6 +48,11 @@ HTML
48
48
  assert_doc( alt, parse( alt, "UTF-8" ) )
49
49
  end
50
50
 
51
+ def test_meta_charset_rerun
52
+ alt = HTML_META.sub( /<meta .*\/>/, '<meta charset="utf-8"/>' )
53
+ assert_doc( alt, parse( alt, "ISO-8859-1" ) )
54
+ end
55
+
51
56
  HTML_SKIP_TAGS = <<HTML
52
57
  <html xmlns="http://www.w3.org/1999/xhtml">
53
58
  <head>
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: iudex-html
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease: 4
5
- version: 1.2.b.0
5
+ version: 1.2.b.1
6
6
  platform: java
7
7
  authors:
8
8
  - David Kellum
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2012-03-05 00:00:00 Z
13
+ date: 2012-06-01 00:00:00 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: iudex-core
@@ -41,7 +41,7 @@ dependencies:
41
41
  requirements:
42
42
  - - ~>
43
43
  - !ruby/object:Gem::Version
44
- version: 1.5.b
44
+ version: 1.5.1
45
45
  requirement: *id003
46
46
  prerelease: false
47
47
  type: :runtime
@@ -117,7 +117,7 @@ files:
117
117
  - test/test_stax_parser.rb
118
118
  - test/test_tree_walker.rb
119
119
  - test/test_word_counters.rb
120
- - lib/iudex-html/iudex-html-1.2.b.0.jar
120
+ - lib/iudex-html/iudex-html-1.2.b.1.jar
121
121
  homepage: http://github.com/dekellum/iudex
122
122
  licenses: []
123
123